From cee9c5eeb1a5d95339affd384c82e582c0fedbc5 Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Tue, 7 Jun 2022 19:21:50 -0500
Subject: [PATCH] Update to v0.5.0 (#78)

* Update OCCA

* Remove gslib

* Upgrade to newest ogs

* Add new comm_t object

* Add new memory objects

* Add new timers

* Move matrix routines into linAlg

* Rename file

* [Core] Update core lib

* [LinAlg] Update linAlg lib

* [LinAlg] Rename some source files

* [Core] Remove ambiguous scan overload

* [OCCA] Update to latest OCCA

* [Mesh] Move some mesh files

* [ParAdogs] Add parAdogs mesh partitioner

* [Mesh] Update mesh library

* [Core] Updates to core library

* [LinAlg] Update linear algebra library

* [TimeStepper] Update timeStepper library

* [LinearSolver] Update linearSolver library

* [ParAlmond] Update parAlmond library

* [Make] Makefile updates

* [Advection] Update advection solver

* [Acoustics] Update acoustics solver

* [Gradient] Update gradient solver

* [Elliptic] Update elliptic solver

* [CNS] Update compressible navier stokes solver

* [Test] Some test tweaks

* [LBS] Update DG Lattice Boltzmann solver

* [LBS] Makefile tweak

* [BNS] Update Galerkin Boltzmann Navier-Stokes solver

* [FPE] Update Fokker-Planck solver

* [INS] Update incompressible Navier-Stokes solver

* [LinAlg] LinAlg fixes

* [InitialGuess] Fixes in initial guess strategies

* [Mesh] Initialize mapB to -1

* [Mesh] mapB in mesh_t class

* [Mesh] Bug fix in cubature setup

* [Mesh] Bugfix in multirate setup

* [ParAlmond] Switch to shared_ptr to address mem leak

* [Test] Tweak one golden norm

* [Test] Add ParAdogs tests

* [Solvers] Switch to strings for filenames and kernelname to avoid memory leaks

* Version bump

* Happy New Year

* Need some manual barriers

* [Solvers] Trigger ogs kernel builds during setups

* [Git] Ignore rc files made by tests

* [Make] Fix default openblas-serial lib path

* [Git] Use openblas-serial in git workflow

* [Comm] Move some things out of the comm_t class, and into a namespace

* [Mesh] Make the element type an enum

* [Make] Missing code coverage flags

* [OGS] Add some more explicit instantiations to work around an issue in older gcc

* [Timer] Can't use the platform's comm at the end of ogsSetup. Adding a new timer with a comm argument

* Update README.md

* Add code diagram

* Update README.md

* [LinearSolver] Bugfix for uninitialized memory

* [Libs][Solvers] Small fixes for building in FP32 mode

* [Core] Remove repeated compiler flags

* [Core] Workaround for occa dtypes being empty for user-types

* [ParAdogs] Reduce a termination tolerance when in FP32 mode

* [ParAdogs] Typo

* [OCCA] Switch to OCCA dev branch

* [Core] Add a check to not exceed the max thread count OpenMP reports

* [OGS] Add some restricts to pointers for host operators

* [OCCA] Fix properties syntax

* [OGS] Add special code paths for scalar ogs ops

* Add some notes on CPU binding to the README

* [Acoustics] Hide more comm time with the surface kernel

* [OGS] Fix final row block entry being too large

* [OCCA] Update to OCCA v1.3
---
 .github/CodeDiagram.png                       |  Bin 0 -> 43595 bytes
 .github/workflows/build.yml                   |    4 +-
 .gitignore                                    |    3 +-
 3rdParty/gslib/.travis.yml                    |   36 -
 3rdParty/gslib/LICENSE                        |   58 -
 3rdParty/gslib/Makefile                       |  187 --
 3rdParty/gslib/README.md                      |   23 -
 3rdParty/gslib/RELEASE.md                     |   17 -
 3rdParty/gslib/cdep.py                        |   33 -
 3rdParty/gslib/makefile.cdep                  |   42 -
 3rdParty/gslib/odep_info.py                   |   50 -
 3rdParty/gslib/src/c99.h                      |   16 -
 3rdParty/gslib/src/comm.c                     |  210 --
 3rdParty/gslib/src/comm.h                     |  259 ---
 3rdParty/gslib/src/crs.h                      |   36 -
 3rdParty/gslib/src/crystal.c                  |  141 --
 3rdParty/gslib/src/crystal.h                  |   21 -
 3rdParty/gslib/src/fail.c                     |   63 -
 3rdParty/gslib/src/fail.h                     |   52 -
 3rdParty/gslib/src/fcrystal.c                 |  191 --
 3rdParty/gslib/src/findpts.c                  |  369 ----
 3rdParty/gslib/src/findpts.h                  |   73 -
 3rdParty/gslib/src/findpts_el.h               |  122 --
 3rdParty/gslib/src/findpts_el_2.c             |  819 -------
 3rdParty/gslib/src/findpts_el_3.c             | 1318 -----------
 3rdParty/gslib/src/findpts_imp.h              |  470 ----
 3rdParty/gslib/src/findpts_local.c            |   52 -
 3rdParty/gslib/src/findpts_local.h            |   96 -
 3rdParty/gslib/src/findpts_local_imp.h        |  388 ----
 3rdParty/gslib/src/gen_poly_imp.c             |  226 --
 3rdParty/gslib/src/gs.c                       | 1651 --------------
 3rdParty/gslib/src/gs.h                       |  155 --
 3rdParty/gslib/src/gs_defs.h                  |   81 -
 3rdParty/gslib/src/gs_local.c                 |  336 ---
 3rdParty/gslib/src/gs_local.h                 |   43 -
 3rdParty/gslib/src/gslib.h                    |   20 -
 3rdParty/gslib/src/lob_bnd.c                  |  285 ---
 3rdParty/gslib/src/lob_bnd.h                  |  111 -
 3rdParty/gslib/src/mem.h                      |  168 --
 3rdParty/gslib/src/name.h                     |   44 -
 3rdParty/gslib/src/obbox.c                    |  341 ---
 3rdParty/gslib/src/obbox.h                    |  113 -
 3rdParty/gslib/src/poly.c                     |  236 --
 3rdParty/gslib/src/poly.h                     |   65 -
 3rdParty/gslib/src/poly_imp.h                 | 1949 -----------------
 3rdParty/gslib/src/rand_elt_test.c            |  169 --
 3rdParty/gslib/src/rand_elt_test.h            |   18 -
 3rdParty/gslib/src/sarray_sort.c              |   45 -
 3rdParty/gslib/src/sarray_sort.h              |   89 -
 3rdParty/gslib/src/sarray_transfer.c          |  198 --
 3rdParty/gslib/src/sarray_transfer.h          |   95 -
 3rdParty/gslib/src/sort.c                     |   31 -
 3rdParty/gslib/src/sort.h                     |   76 -
 3rdParty/gslib/src/sort_imp.h                 |  544 -----
 3rdParty/gslib/src/tensor.c                   |   82 -
 3rdParty/gslib/src/tensor.h                   |  199 --
 3rdParty/gslib/src/types.h                    |   85 -
 3rdParty/gslib/tests/comm_test.c              |   37 -
 3rdParty/gslib/tests/crystal_test.c           |   88 -
 3rdParty/gslib/tests/findpts_el_2_test.c      |   73 -
 3rdParty/gslib/tests/findpts_el_2_test2.c     |   97 -
 3rdParty/gslib/tests/findpts_el_3_test.c      |   77 -
 3rdParty/gslib/tests/findpts_el_3_test2.c     |  107 -
 3rdParty/gslib/tests/findpts_local_test.c     |  210 --
 3rdParty/gslib/tests/findpts_test.c           |  328 ---
 3rdParty/gslib/tests/fortran/f-igs.f          |   59 -
 3rdParty/gslib/tests/gs_test.c                |  133 --
 3rdParty/gslib/tests/gs_test_gop_blocking.c   |  107 -
 .../gslib/tests/gs_test_gop_nonblocking.c     |  131 --
 3rdParty/gslib/tests/gs_test_old.c            |  148 --
 3rdParty/gslib/tests/gs_unique_test.c         |   87 -
 3rdParty/gslib/tests/lob_bnd_test.c           |  185 --
 3rdParty/gslib/tests/obbox_test.c             |  207 --
 3rdParty/gslib/tests/poly_test.c              |   23 -
 3rdParty/gslib/tests/run_tests.sh             |   21 -
 3rdParty/gslib/tests/sarray_sort_test.c       |   47 -
 3rdParty/gslib/tests/sarray_transfer_test.c   |   93 -
 3rdParty/gslib/tests/sort_test.c              |  113 -
 3rdParty/gslib/tests/sort_test2.c             |   58 -
 LICENSE                                       |    2 +-
 README.md                                     |   87 +-
 include/comm.hpp                              |  565 +++++
 include/core.hpp                              |   58 +-
 include/initialGuess.hpp                      |  148 +-
 include/linAlg.hpp                            |  189 +-
 include/linearSolver.hpp                      |  188 +-
 include/memory.hpp                            |  778 +++++++
 include/mesh.hpp                              | 1338 +++++++----
 include/mesh/mesh2D.hpp                       |   94 -
 include/mesh/mesh3D.hpp                       |  141 --
 include/mesh/meshDefines3D.h                  |   99 -
 include/ogs.hpp                               |  551 +++--
 include/ogs/ogsBase.hpp                       |  112 +
 include/ogs/ogsDefs.h                         |   65 -
 include/ogs/ogsExchange.hpp                   |  336 +++
 include/ogs/ogsKernels.hpp                    |  203 --
 include/ogs/ogsOperator.hpp                   |  147 ++
 include/ogs/ogsUtils.hpp                      |   87 +
 .../operator.hpp                              |   30 +-
 .../parAdogs.hpp                              |   76 +-
 include/parAdogs/parAdogsGraph.hpp            |  159 ++
 include/parAdogs/parAdogsMatrix.hpp           |  128 ++
 include/parAdogs/parAdogsMultigrid.hpp        |  121 +
 .../parAdogsPartition.hpp}                    |   44 +-
 include/parAlmond.hpp                         |  189 +-
 include/parAlmond/parAlmondAMGLevel.hpp       |   27 +-
 include/parAlmond/parAlmondAMGSetup.hpp       |   48 +-
 include/parAlmond/parAlmondCoarseSolver.hpp   |   71 +-
 include/parAlmond/parAlmondDefines.hpp        |   18 +-
 include/parAlmond/parAlmondKernels.hpp        |   47 +-
 include/parAlmond/parAlmondMultigrid.hpp      |  123 --
 include/parAlmond/parAlmondparCSR.hpp         |  103 +-
 include/platform.hpp                          |  198 +-
 include/precon.hpp                            |   38 +-
 include/settings.hpp                          |   53 +-
 include/solver.hpp                            |   75 +-
 include/timeStepper.hpp                       |  520 ++---
 include/timer.hpp                             |   59 +
 include/types.h                               |   16 +-
 include/utils.hpp                             |  109 +-
 libs/core/comm.cpp                            |  118 +
 libs/core/exception.cpp                       |   90 +
 libs/core/factor.cpp                          |   89 -
 libs/core/matrixEig.cpp                       |  178 --
 libs/core/matrixRightSolve.cpp                |  301 ---
 .../memory.cpp}                               |   35 +-
 libs/core/parallelSort.cpp                    |  145 --
 libs/core/platformBuildKernel.cpp             |   23 +-
 libs/core/platformDeviceConfig.cpp            |  159 +-
 libs/core/platformProperties.cpp              |   98 +-
 libs/core/platformSettings.cpp                |   19 +-
 libs/core/rankDecomp.cpp                      |  215 ++
 libs/core/settings.cpp                        |  101 +-
 .../core/timer.cpp                            |   70 +-
 libs/linAlg/linAlg.cpp                        |  175 +-
 .../linAlgMatrixConditionNumber.cpp}          |   92 +-
 libs/linAlg/linAlgMatrixEig.cpp               |  152 ++
 .../linAlgMatrixInverse.cpp}                  |   64 +-
 libs/linAlg/linAlgMatrixRightSolve.cpp        |  381 ++++
 .../linAlgMatrixTranspose.cpp}                |   46 +-
 libs/linAlg/linAlgSetup.cpp                   |  122 +-
 libs/linAlg/okl/linAlgADXPY.okl               |    4 +-
 libs/linAlg/okl/linAlgAMXPY.okl               |    2 +-
 libs/linAlg/okl/linAlgAXPY.okl                |    4 +-
 libs/linAlg/okl/linAlgAdd.okl                 |    2 +-
 libs/linAlg/okl/linAlgInnerProd.okl           |   69 +-
 libs/linAlg/okl/linAlgMax.okl                 |   72 +-
 libs/linAlg/okl/linAlgMin.okl                 |   72 +-
 libs/linAlg/okl/linAlgNorm2.okl               |   96 +-
 libs/linAlg/okl/linAlgScale.okl               |    2 +-
 libs/linAlg/okl/linAlgSet.okl                 |    2 +-
 libs/linAlg/okl/linAlgSum.okl                 |   71 +-
 libs/linAlg/okl/linAlgWeightedInnerProd.okl   |  132 +-
 libs/linAlg/okl/linAlgWeightedNorm2.okl       |    2 +-
 libs/linearSolver/initialGuess.cpp            |  351 +--
 libs/linearSolver/linearSolver.cpp            |   52 +-
 libs/linearSolver/linearSolverNBFPCG.cpp      |  152 +-
 libs/linearSolver/linearSolverNBPCG.cpp       |  128 +-
 libs/linearSolver/linearSolverPCG.cpp         |   88 +-
 libs/linearSolver/linearSolverPGMRES.cpp      |   71 +-
 libs/linearSolver/linearSolverPMINRES.cpp     |   71 +-
 .../linearSolver/okl/igBasisInnerProducts.okl |   11 +-
 libs/linearSolver/okl/igDropQRFirstColumn.okl |    6 +-
 libs/linearSolver/okl/igExtrap.okl            |    2 +-
 libs/linearSolver/okl/igReconstruct.okl       |    3 +-
 libs/linearSolver/okl/igScale.okl             |    2 +-
 libs/linearSolver/okl/igUpdate.okl            |    2 +-
 .../okl/linearSolverUpdateMINRES.okl          |    2 +-
 .../okl/linearSolverUpdateNBFPCG.okl          |   26 +-
 .../okl/linearSolverUpdateNBPCG.okl           |   26 +-
 .../okl/linearSolverUpdatePCG.okl             |   19 +-
 libs/makefile                                 |  101 +-
 libs/mesh/mesh.cpp                            |   67 -
 libs/mesh/meshBasis1D.cpp                     |  233 +-
 libs/mesh/meshBasisHex3D.cpp                  |  393 +++-
 libs/mesh/meshBasisQuad2D.cpp                 |  315 ++-
 libs/mesh/meshBasisTet3D.cpp                  |  688 ++++--
 libs/mesh/meshBasisTri2D.cpp                  |  685 +++---
 libs/mesh/meshConnect.cpp                     |  273 ++-
 libs/mesh/meshConnectBoundary.cpp             |   39 +-
 libs/mesh/meshConnectFaceNodes.cpp            |  103 +
 libs/mesh/meshConnectFaceNodes2D.cpp          |  138 --
 libs/mesh/meshConnectFaceNodes3D.cpp          |  145 --
 libs/mesh/meshConnectFaceVertices.cpp         |   66 +
 libs/mesh/meshConnectNodes.cpp                |  119 +
 libs/mesh/meshCubatureNodesHex3D.cpp          |   67 +-
 libs/mesh/meshCubatureNodesQuad2D.cpp         |   42 +-
 libs/mesh/meshCubatureNodesQuad3D.cpp         |   54 +-
 libs/mesh/meshCubatureNodesTet3D.cpp          |   42 +-
 libs/mesh/meshCubatureNodesTri2D.cpp          |   27 +-
 libs/mesh/meshCubatureNodesTri3D.cpp          |   33 +-
 libs/mesh/meshCubatureSetupHex3D.cpp          |  169 +-
 libs/mesh/meshCubatureSetupQuad2D.cpp         |   95 +-
 libs/mesh/meshCubatureSetupTet3D.cpp          |   84 +-
 libs/mesh/meshCubatureSetupTri2D.cpp          |   83 +-
 ...erSetup.cpp => meshGatherScatterSetup.cpp} |   78 +-
 libs/mesh/meshGeometricFactorsHex3D.cpp       |   87 +-
 libs/mesh/meshGeometricFactorsQuad2D.cpp      |   67 +-
 libs/mesh/meshGeometricFactorsQuad3D.cpp      |  475 ++--
 libs/mesh/meshGeometricFactorsTet3D.cpp       |   89 +-
 libs/mesh/meshGeometricFactorsTri2D.cpp       |   60 +-
 libs/mesh/meshGeometricFactorsTri3D.cpp       |   91 +-
 libs/mesh/meshGeometricPartition2D.cpp        |  396 ----
 libs/mesh/meshGeometricPartition3D.cpp        |  354 ---
 libs/mesh/meshHaloRingSetup.cpp               |  190 +-
 libs/mesh/meshHaloSetup.cpp                   |   46 +-
 libs/mesh/meshHaloTraceSetup.cpp              |   36 +-
 libs/mesh/meshMassMatrixApply.cpp             |   39 +-
 libs/mesh/meshMinCharacteristicLength.cpp     |   73 +-
 libs/mesh/meshMultiRateHaloTraceSetup.cpp     |   55 +-
 libs/mesh/meshMultiRateSetup.cpp              |   84 +-
 libs/mesh/meshOccaSetup.cpp                   |   62 -
 libs/mesh/meshOccaSetup2D.cpp                 |   60 -
 libs/mesh/meshOccaSetup3D.cpp                 |   77 -
 libs/mesh/meshOccaSetupHex3D.cpp              |   88 -
 libs/mesh/meshOccaSetupTet3D.cpp              |   80 -
 libs/mesh/meshOccaSetupTri2D.cpp              |   72 -
 libs/mesh/meshOccaSetupTri3D.cpp              |   72 -
 libs/mesh/meshParallelConnectNodes.cpp        |  110 -
 libs/mesh/meshParallelConnectOpt.cpp          |  233 --
 libs/mesh/meshParallelReaderQuad3D.cpp        |  227 --
 libs/mesh/meshParallelReaderTri3D.cpp         |  223 --
 libs/mesh/meshPartition.cpp                   |   51 +
 libs/mesh/meshPhysicalNodesHex3D.cpp          |   31 +-
 libs/mesh/meshPhysicalNodesQuad2D.cpp         |   26 +-
 libs/mesh/meshPhysicalNodesQuad3D.cpp         |   31 +-
 libs/mesh/meshPhysicalNodesTet3D.cpp          |   30 +-
 libs/mesh/meshPhysicalNodesTri2D.cpp          |   25 +-
 libs/mesh/meshPhysicalNodesTri3D.cpp          |   31 +-
 libs/mesh/meshPlotInterpHex3D.cpp             |   33 +-
 libs/mesh/meshPlotInterpQuad2D.cpp            |   27 +-
 libs/mesh/meshPlotInterpQuad3D.cpp            |   78 -
 libs/mesh/meshPlotInterpTet3D.cpp             |    9 +-
 libs/mesh/meshPlotInterpTri2D.cpp             |    9 +-
 libs/mesh/meshPmlSetup.cpp                    |   55 +-
 ...lReaderHex3D.cpp => meshReadGmshHex3D.cpp} |  123 +-
 ...eaderQuad2D.cpp => meshReadGmshQuad2D.cpp} |  118 +-
 libs/mesh/meshReadGmshQuad3D.cpp              |  189 ++
 ...lReaderTet3D.cpp => meshReadGmshTet3D.cpp} |  121 +-
 ...lReaderTri2D.cpp => meshReadGmshTri2D.cpp} |  118 +-
 libs/mesh/meshReadGmshTri3D.cpp               |  184 ++
 libs/mesh/meshReferenceNodesHex3D.cpp         |   49 +-
 libs/mesh/meshReferenceNodesQuad2D.cpp        |   54 +-
 libs/mesh/meshReferenceNodesTet3D.cpp         |   79 +-
 libs/mesh/meshReferenceNodesTri2D.cpp         |   74 +-
 libs/mesh/meshSetElementType.cpp              |   90 +
 libs/mesh/meshSettings.cpp                    |   20 +-
 libs/mesh/meshSetup.cpp                       |  110 +-
 libs/mesh/meshSetupBoxHex3D.cpp               |   67 +-
 libs/mesh/meshSetupBoxQuad2D.cpp              |   60 +-
 libs/mesh/meshSetupBoxTet3D.cpp               |   55 +-
 libs/mesh/meshSetupBoxTri2D.cpp               |   55 +-
 libs/mesh/meshSetupNewDegree.cpp              |  114 +-
 libs/mesh/meshSetupPmlBoxHex3D.cpp            |   76 +-
 libs/mesh/meshSetupPmlBoxQuad2D.cpp           |   93 +-
 libs/mesh/meshSetupPmlBoxTet3D.cpp            |   67 +-
 libs/mesh/meshSetupPmlBoxTri2D.cpp            |   57 +-
 libs/mesh/meshSetupRingPatch.cpp              |  204 +-
 libs/mesh/meshSetupSEMFEM.cpp                 |  259 +--
 .../mesh/meshSurfaceGeometricFactorsHex3D.cpp |  138 +-
 .../meshSurfaceGeometricFactorsQuad2D.cpp     |   79 +-
 .../meshSurfaceGeometricFactorsQuad3D.cpp     |  358 +--
 .../mesh/meshSurfaceGeometricFactorsTet3D.cpp |  121 +-
 .../mesh/meshSurfaceGeometricFactorsTri2D.cpp |   99 +-
 .../mesh/meshSurfaceGeometricFactorsTri3D.cpp |  174 +-
 libs/mesh/okl/MassMatrixOperatorHex3D.okl     |    9 +-
 libs/mesh/okl/MassMatrixOperatorQuad2D.okl    |    9 +-
 libs/mesh/okl/MassMatrixOperatorTet3D.okl     |    9 +-
 libs/mesh/okl/MassMatrixOperatorTri2D.okl     |    9 +-
 libs/ogs/gs.cpp                               |  132 --
 libs/ogs/hostGather.cpp                       |  164 --
 libs/ogs/hostGatherScatter.cpp                |  180 --
 libs/ogs/hostScatter.cpp                      |  155 --
 libs/ogs/occaGather.cpp                       |  170 --
 libs/ogs/occaGatherScatter.cpp                |  176 --
 libs/ogs/occaGatheredHaloExchange.cpp         |  123 --
 libs/ogs/occaScatter.cpp                      |  165 --
 libs/ogs/ogs.cpp                              |  638 ++++--
 libs/ogs/ogsAllToAll.cpp                      |  358 +++
 libs/ogs/ogsAuto.cpp                          |  349 +++
 libs/ogs/ogsCrystalRouter.cpp                 |  775 +++++++
 libs/ogs/ogsHalo.cpp                          |  395 ++++
 libs/ogs/ogsKernels.cpp                       |  132 --
 libs/ogs/ogsOperator.cpp                      |  635 ++++++
 libs/ogs/ogsPairwise.cpp                      |  430 ++++
 libs/ogs/ogsSetup.cpp                         | 1184 ++++++----
 libs/ogs/ogsUtils.cpp                         |  127 ++
 libs/ogs/okl/gatherScatter.okl                |  185 --
 libs/ogs/okl/ogsKernels.okl                   |  177 ++
 libs/parAdogs/parAdogsConnect.cpp             |  242 ++
 libs/parAdogs/parAdogsCuthillMckee.cpp        |  153 ++
 libs/parAdogs/parAdogsFiedlerVector.cpp       |  176 ++
 libs/parAdogs/parAdogsGraph.cpp               |  426 ++++
 libs/parAdogs/parAdogsInertialBipartition.cpp |  201 ++
 libs/parAdogs/parAdogsInertialPartition.cpp   |   60 +
 libs/parAdogs/parAdogsMatrix.cpp              |  416 ++++
 libs/parAdogs/parAdogsMeshPartition.cpp       |  114 +
 libs/parAdogs/parAdogsMultigrid.cpp           |   94 +
 libs/parAdogs/parAdogsMultigridAggregate.cpp  |  315 +++
 .../parAdogsMultigridCoarseSolver.cpp         |  148 ++
 libs/parAdogs/parAdogsMultigridLaplacian.cpp  |  166 ++
 libs/parAdogs/parAdogsMultigridSetup.cpp      |  190 ++
 libs/parAdogs/parAdogsMultigridSmooth.cpp     |  177 ++
 .../parAdogsMultigridSmoothPrologator.cpp     |  332 +++
 libs/parAdogs/parAdogsMultigridSpMM.cpp       |  294 +++
 .../parAdogsMultigridTentativeProlongator.cpp |  101 +
 libs/parAdogs/parAdogsMultigridTranspose.cpp  |  177 ++
 libs/parAdogs/parAdogsParallelPivot.cpp       |  105 +
 libs/parAdogs/parAdogsRefine.cpp              |  141 ++
 .../parAdogsSettings.cpp}                     |   30 +-
 libs/parAdogs/parAdogsSolve.cpp               |  131 ++
 libs/parAdogs/parAdogsSpectralBipartition.cpp |   73 +
 libs/parAdogs/parAdogsSpectralPartition.cpp   |   61 +
 libs/parAlmond/okl/SmoothChebyshev.okl        |    4 +-
 libs/parAlmond/okl/SmoothJacobi.okl           |    4 +-
 libs/parAlmond/okl/SpMVcsr.okl                |    2 +-
 libs/parAlmond/okl/SpMVmcsr.okl               |    2 +-
 libs/parAlmond/okl/dGEMV.okl                  |    2 +-
 libs/parAlmond/okl/kcycleCombinedOp.okl       |   44 +-
 libs/parAlmond/okl/vectorAddInnerProd.okl     |   24 +-
 libs/parAlmond/parAlmond.cpp                  |   52 +-
 libs/parAlmond/parAlmondAMGLevel.cpp          |  110 +-
 libs/parAlmond/parAlmondAMGSetup.cpp          |  116 +-
 libs/parAlmond/parAlmondAMGSmoother.cpp       |   82 +-
 libs/parAlmond/parAlmondCoarseExact.cpp       |  190 +-
 libs/parAlmond/parAlmondCoarseOAS.cpp         |  221 +-
 libs/parAlmond/parAlmondCoarsenLevel.cpp      |   55 +-
 libs/parAlmond/parAlmondFormAggregates.cpp    |  106 +-
 libs/parAlmond/parAlmondGalerkinProd.cpp      |  116 +-
 libs/parAlmond/parAlmondKcycle.cpp            |  198 +-
 libs/parAlmond/parAlmondKernels.cpp           |  115 +-
 libs/parAlmond/parAlmondMultigrid.cpp         |   81 +-
 libs/parAlmond/parAlmondSettings.cpp          |   10 +-
 libs/parAlmond/parAlmondSmoothPrologator.cpp  |  204 +-
 libs/parAlmond/parAlmondSpMM.cpp              |  174 +-
 libs/parAlmond/parAlmondStrongGraph.cpp       |  164 +-
 .../parAlmondTentativeProlongator.cpp         |   61 +-
 libs/parAlmond/parAlmondTranspose.cpp         |   88 +-
 libs/parAlmond/parAlmondVcycle.cpp            |   28 +-
 libs/parAlmond/parAlmondparCSR.cpp            |  231 +-
 libs/timeStepper/okl/timeStepperAB.okl        |    2 +-
 libs/timeStepper/okl/timeStepperDOPRI5.okl    |   11 +-
 libs/timeStepper/okl/timeStepperEXTBDF.okl    |    2 +-
 libs/timeStepper/okl/timeStepperLSERK4.okl    |    2 +-
 libs/timeStepper/okl/timeStepperMRAB.okl      |   10 +-
 libs/timeStepper/okl/timeStepperMRSAAB.okl    |   10 +-
 libs/timeStepper/okl/timeStepperSAAB.okl      |    4 +-
 libs/timeStepper/okl/timeStepperSARK.okl      |   10 +-
 libs/timeStepper/okl/timeStepperSSBDF.okl     |    2 +-
 libs/timeStepper/timeStepper.cpp              |   59 +
 libs/timeStepper/timeStepperAB3.cpp           |   80 +-
 libs/timeStepper/timeStepperDOPRI5.cpp        |  180 +-
 libs/timeStepper/timeStepperEXTBDF3.cpp       |   77 +-
 libs/timeStepper/timeStepperLSERK4.cpp        |   89 +-
 libs/timeStepper/timeStepperMRAB3.cpp         |  139 +-
 libs/timeStepper/timeStepperMRSAAB3.cpp       |  184 +-
 libs/timeStepper/timeStepperSAAB3.cpp         |  118 +-
 libs/timeStepper/timeStepperSARK4.cpp         |  233 +-
 libs/timeStepper/timeStepperSARK5.cpp         |  234 +-
 libs/timeStepper/timeStepperSSBDF3.cpp        |   63 +-
 make.top                                      |   66 +-
 makefile                                      |    2 +-
 occa                                          |    2 +-
 solvers/acoustics/acoustics.hpp               |   45 +-
 solvers/acoustics/acousticsMain.cpp           |   49 +-
 solvers/acoustics/data/acousticsGaussian2D.h  |    2 +-
 solvers/acoustics/data/acousticsGaussian3D.h  |    4 +-
 solvers/acoustics/makefile                    |   19 +-
 .../okl/acousticsInitialCondition2D.okl       |    2 +-
 .../okl/acousticsInitialCondition3D.okl       |    2 +-
 .../acoustics/okl/acousticsSurfaceHex3D.okl   |   46 +-
 .../acoustics/okl/acousticsSurfaceQuad2D.okl  |   41 +-
 .../acoustics/okl/acousticsSurfaceTet3D.okl   |   33 +-
 .../acoustics/okl/acousticsSurfaceTri2D.okl   |   29 +-
 .../acoustics/okl/acousticsVolumeHex3D.okl    |    4 +-
 .../acoustics/okl/acousticsVolumeQuad2D.okl   |    4 +-
 .../acoustics/okl/acousticsVolumeTet3D.okl    |   10 +-
 .../acoustics/okl/acousticsVolumeTri2D.okl    |    4 +-
 solvers/acoustics/src/acousticsPlotFields.cpp |   46 +-
 solvers/acoustics/src/acousticsReport.cpp     |    8 +-
 solvers/acoustics/src/acousticsRun.cpp        |    8 +-
 solvers/acoustics/src/acousticsSettings.cpp   |   23 +-
 solvers/acoustics/src/acousticsSetup.cpp      |  128 +-
 solvers/acoustics/src/acousticsStep.cpp       |   51 +-
 solvers/advection/advection.hpp               |   47 +-
 solvers/advection/advectionMain.cpp           |   47 +-
 solvers/advection/data/advectionLinear2D.h    |    2 +-
 solvers/advection/data/advectionLinear3D.h    |    2 +-
 solvers/advection/makefile                    |   17 +-
 .../okl/advectionInitialCondition2D.okl       |    2 +-
 .../okl/advectionInitialCondition3D.okl       |    2 +-
 .../okl/advectionMaxWaveSpeedHex3D.okl        |    6 +-
 .../okl/advectionMaxWaveSpeedQuad2D.okl       |    6 +-
 .../okl/advectionMaxWaveSpeedTet3D.okl        |    4 +-
 .../okl/advectionMaxWaveSpeedTri2D.okl        |    4 +-
 .../advection/okl/advectionSurfaceHex3D.okl   |    7 +-
 .../advection/okl/advectionSurfaceQuad2D.okl  |    8 +-
 .../advection/okl/advectionSurfaceTet3D.okl   |    5 +-
 .../advection/okl/advectionSurfaceTri2D.okl   |    5 +-
 .../advection/okl/advectionVolumeHex3D.okl    |    4 +-
 .../advection/okl/advectionVolumeQuad2D.okl   |    6 +-
 .../advection/okl/advectionVolumeTet3D.okl    |    4 +-
 .../advection/okl/advectionVolumeTri2D.okl    |    4 +-
 solvers/advection/src/advectionPlotFields.cpp |   40 +-
 solvers/advection/src/advectionReport.cpp     |    8 +-
 solvers/advection/src/advectionRun.cpp        |    8 +-
 solvers/advection/src/advectionSettings.cpp   |   23 +-
 solvers/advection/src/advectionSetup.cpp      |  127 +-
 solvers/advection/src/advectionStep.cpp       |   17 +-
 solvers/bns/bns.hpp                           |  119 +-
 solvers/bns/bnsMain.cpp                       |   49 +-
 solvers/bns/data/bnsGaussian2D.h              |    2 +-
 solvers/bns/data/bnsGaussian3D.h              |    2 +-
 solvers/bns/data/bnsUniform2D.h               |    2 +-
 solvers/bns/data/bnsUniform3D.h               |    2 +-
 solvers/bns/makefile                          |   17 +-
 solvers/bns/okl/bnsConstrainQuad3D.okl        |   44 +-
 solvers/bns/okl/bnsInitialCondition2D.okl     |    2 +-
 solvers/bns/okl/bnsInitialCondition3D.okl     |    2 +-
 solvers/bns/okl/bnsIsoSurface3D.okl           |    4 +-
 solvers/bns/okl/bnsRelaxationHex3D.okl        |   35 +-
 solvers/bns/okl/bnsRelaxationQuad2D.okl       |   16 +-
 solvers/bns/okl/bnsRelaxationQuad3D.okl       |  297 ++-
 solvers/bns/okl/bnsRelaxationTet3D.okl        |    7 +-
 solvers/bns/okl/bnsRelaxationTri2D.okl        |    6 +-
 solvers/bns/okl/bnsSurfaceHex3D.okl           |   18 +-
 solvers/bns/okl/bnsSurfaceQuad2D.okl          |   14 +-
 solvers/bns/okl/bnsSurfaceQuad3D.okl          |  205 +-
 solvers/bns/okl/bnsSurfaceTet3D.okl           |    6 +-
 solvers/bns/okl/bnsSurfaceTri2D.okl           |    6 +-
 solvers/bns/okl/bnsVolumeHex3D.okl            |    5 +-
 solvers/bns/okl/bnsVolumeQuad2D.okl           |    5 +-
 solvers/bns/okl/bnsVolumeQuad3D.okl           |  337 ++-
 solvers/bns/okl/bnsVolumeTet3D.okl            |    5 +-
 solvers/bns/okl/bnsVolumeTri2D.okl            |    7 +-
 solvers/bns/okl/bnsVorticityHex3D.okl         |    5 +-
 solvers/bns/okl/bnsVorticityQuad2D.okl        |    5 +-
 solvers/bns/okl/bnsVorticityQuad3D.okl        |    3 +-
 solvers/bns/okl/bnsVorticityTet3D.okl         |    3 +-
 solvers/bns/okl/bnsVorticityTri2D.okl         |    3 +-
 solvers/bns/src/bnsPlotFields.cpp             |   57 +-
 solvers/bns/src/bnsPmlSetup.cpp               |   22 +-
 solvers/bns/src/bnsReport.cpp                 |    8 +-
 solvers/bns/src/bnsRun.cpp                    |   10 +-
 solvers/bns/src/bnsSettings.cpp               |   23 +-
 solvers/bns/src/bnsSetup.cpp                  |  248 +--
 solvers/bns/src/bnsStep.cpp                   |   64 +-
 solvers/cns/cns.hpp                           |   71 +-
 solvers/cns/cnsMain.cpp                       |   49 +-
 solvers/cns/data/cnsGaussian2D.h              |    2 +-
 solvers/cns/data/cnsGaussian3D.h              |    2 +-
 solvers/cns/data/cnsUniform2D.h               |    2 +-
 solvers/cns/data/cnsUniform3D.h               |    2 +-
 solvers/cns/data/cnsVortexDipole2D.h          |    2 +-
 solvers/cns/makefile                          |   17 +-
 solvers/cns/okl/cnsConstrainQuad3D.okl        |   44 +-
 solvers/cns/okl/cnsCubatureSurfaceHex3D.okl   |   24 +-
 solvers/cns/okl/cnsCubatureSurfaceQuad2D.okl  |   12 +-
 solvers/cns/okl/cnsCubatureSurfaceQuad3D.okl  |  758 ++++---
 solvers/cns/okl/cnsCubatureSurfaceTet3D.okl   |    8 +-
 solvers/cns/okl/cnsCubatureSurfaceTri2D.okl   |    4 +-
 solvers/cns/okl/cnsCubatureVolumeHex3D.okl    |   14 +-
 solvers/cns/okl/cnsCubatureVolumeQuad2D.okl   |    9 +-
 solvers/cns/okl/cnsCubatureVolumeQuad3D.okl   |  244 +--
 solvers/cns/okl/cnsCubatureVolumeTet3D.okl    |    4 +-
 solvers/cns/okl/cnsCubatureVolumeTri2D.okl    |    4 +-
 solvers/cns/okl/cnsGradSurfaceHex3D.okl       |    4 +-
 solvers/cns/okl/cnsGradSurfaceQuad2D.okl      |    5 +-
 solvers/cns/okl/cnsGradSurfaceTet3D.okl       |    3 +-
 solvers/cns/okl/cnsGradSurfaceTri2D.okl       |    3 +-
 solvers/cns/okl/cnsGradVolumeHex3D.okl        |    3 +-
 solvers/cns/okl/cnsGradVolumeQuad2D.okl       |    3 +-
 solvers/cns/okl/cnsGradVolumeTet3D.okl        |    3 +-
 solvers/cns/okl/cnsGradVolumeTri2D.okl        |    3 +-
 solvers/cns/okl/cnsInitialCondition2D.okl     |    2 +-
 solvers/cns/okl/cnsInitialCondition3D.okl     |    2 +-
 .../okl/cnsIsothermalCubatureSurfaceHex3D.okl |   23 +-
 .../cnsIsothermalCubatureSurfaceQuad2D.okl    |    7 +-
 .../okl/cnsIsothermalCubatureSurfaceTet3D.okl |    8 +-
 .../okl/cnsIsothermalCubatureSurfaceTri2D.okl |    4 +-
 .../okl/cnsIsothermalCubatureVolumeHex3D.okl  |   14 +-
 .../okl/cnsIsothermalCubatureVolumeQuad2D.okl |    9 +-
 .../okl/cnsIsothermalCubatureVolumeTet3D.okl  |    4 +-
 .../okl/cnsIsothermalCubatureVolumeTri2D.okl  |    4 +-
 solvers/cns/okl/cnsIsothermalSurfaceHex3D.okl |    7 +-
 .../cns/okl/cnsIsothermalSurfaceQuad2D.okl    |    5 +-
 solvers/cns/okl/cnsIsothermalSurfaceTet3D.okl |    3 +-
 solvers/cns/okl/cnsIsothermalSurfaceTri2D.okl |    3 +-
 solvers/cns/okl/cnsIsothermalVolumeHex3D.okl  |    3 +-
 solvers/cns/okl/cnsIsothermalVolumeQuad2D.okl |    3 +-
 solvers/cns/okl/cnsIsothermalVolumeTet3D.okl  |    3 +-
 solvers/cns/okl/cnsIsothermalVolumeTri2D.okl  |    3 +-
 solvers/cns/okl/cnsMaxWaveSpeedHex3D.okl      |    2 +-
 solvers/cns/okl/cnsMaxWaveSpeedQuad2D.okl     |    4 +-
 solvers/cns/okl/cnsMaxWaveSpeedTet3D.okl      |    4 +-
 solvers/cns/okl/cnsMaxWaveSpeedTri2D.okl      |    4 +-
 solvers/cns/okl/cnsSurfaceHex3D.okl           |    7 +-
 solvers/cns/okl/cnsSurfaceQuad2D.okl          |    5 +-
 solvers/cns/okl/cnsSurfaceQuad3D.okl          |  370 ++--
 solvers/cns/okl/cnsSurfaceTet3D.okl           |    3 +-
 solvers/cns/okl/cnsSurfaceTri2D.okl           |    3 +-
 solvers/cns/okl/cnsVolumeHex3D.okl            |    3 +-
 solvers/cns/okl/cnsVolumeQuad2D.okl           |    5 +-
 solvers/cns/okl/cnsVolumeQuad3D.okl           |  194 +-
 solvers/cns/okl/cnsVolumeTet3D.okl            |    3 +-
 solvers/cns/okl/cnsVolumeTri2D.okl            |    3 +-
 solvers/cns/okl/cnsVorticityHex3D.okl         |  137 +-
 solvers/cns/okl/cnsVorticityQuad2D.okl        |    5 +-
 solvers/cns/okl/cnsVorticityQuad3D.okl        |   65 +-
 solvers/cns/okl/cnsVorticityTet3D.okl         |    3 +-
 solvers/cns/okl/cnsVorticityTri2D.okl         |    3 +-
 solvers/cns/src/cnsPlotFields.cpp             |   59 +-
 solvers/cns/src/cnsReport.cpp                 |    8 +-
 solvers/cns/src/cnsRun.cpp                    |   10 +-
 solvers/cns/src/cnsSettings.cpp               |   23 +-
 solvers/cns/src/cnsSetup.cpp                  |  253 +--
 solvers/cns/src/cnsStep.cpp                   |   19 +-
 solvers/elliptic/data/ellipticBoundary2D.h    |    2 +-
 solvers/elliptic/data/ellipticBoundary3D.h    |    2 +-
 solvers/elliptic/data/ellipticHomogeneous2D.h |    2 +-
 solvers/elliptic/data/ellipticHomogeneous3D.h |    2 +-
 solvers/elliptic/data/ellipticSine2D.h        |    2 +-
 solvers/elliptic/data/ellipticSine3D.h        |    2 +-
 solvers/elliptic/elliptic.hpp                 |  115 +-
 solvers/elliptic/ellipticMain.cpp             |   66 +-
 solvers/elliptic/ellipticPrecon.hpp           |  214 +-
 solvers/elliptic/makefile                     |   17 +-
 solvers/elliptic/okl/ellipticAddBCHex3D.okl   |    2 +-
 solvers/elliptic/okl/ellipticAddBCQuad2D.okl  |    4 +-
 solvers/elliptic/okl/ellipticAddBCQuad3D.okl  |    2 +-
 solvers/elliptic/okl/ellipticAddBCTet3D.okl   |    2 +-
 solvers/elliptic/okl/ellipticAddBCTri2D.okl   |    2 +-
 solvers/elliptic/okl/ellipticAxHex3D.okl      |   38 +-
 solvers/elliptic/okl/ellipticAxIpdgHex3D.okl  |   20 +-
 solvers/elliptic/okl/ellipticAxIpdgQuad2D.okl |   10 +-
 solvers/elliptic/okl/ellipticAxIpdgQuad3D.okl |   10 +-
 solvers/elliptic/okl/ellipticAxIpdgTet3D.okl  |   10 +-
 solvers/elliptic/okl/ellipticAxIpdgTri2D.okl  |   10 +-
 solvers/elliptic/okl/ellipticAxIpdgTri3D.okl  |  568 +++--
 solvers/elliptic/okl/ellipticAxQuad2D.okl     |   18 +-
 solvers/elliptic/okl/ellipticAxQuad3D.okl     |   68 +-
 solvers/elliptic/okl/ellipticAxTet3D.okl      |   16 +-
 solvers/elliptic/okl/ellipticAxTri2D.okl      |   10 +-
 solvers/elliptic/okl/ellipticAxTri3D.okl      |   10 +-
 .../elliptic/okl/ellipticCubatureAxHex3D.okl  |  406 ++--
 .../elliptic/okl/ellipticGradientHex3D.okl    |    4 +-
 .../elliptic/okl/ellipticGradientQuad2D.okl   |    4 +-
 .../elliptic/okl/ellipticGradientQuad3D.okl   |    4 +-
 .../elliptic/okl/ellipticGradientTet3D.okl    |    4 +-
 .../elliptic/okl/ellipticGradientTri2D.okl    |    5 +-
 .../elliptic/okl/ellipticGradientTri3D.okl    |   46 +-
 solvers/elliptic/okl/ellipticMask.okl         |    2 +-
 solvers/elliptic/okl/ellipticPatchSolver.okl  |    3 +-
 .../okl/ellipticPreconBlockJacobi.okl         |    6 +-
 .../okl/ellipticPreconCoarsenHex3D.okl        |   14 +-
 .../okl/ellipticPreconCoarsenQuad2D.okl       |    8 +-
 .../okl/ellipticPreconCoarsenTet3D.okl        |    4 +-
 .../okl/ellipticPreconCoarsenTri2D.okl        |    5 +-
 .../okl/ellipticPreconProlongateHex3D.okl     |   14 +-
 .../okl/ellipticPreconProlongateQuad2D.okl    |    8 +-
 .../okl/ellipticPreconProlongateTet3D.okl     |    4 +-
 .../okl/ellipticPreconProlongateTri2D.okl     |    5 +-
 solvers/elliptic/okl/ellipticRhsBCHex3D.okl   |   14 +-
 .../elliptic/okl/ellipticRhsBCIpdgHex3D.okl   |   11 +-
 .../elliptic/okl/ellipticRhsBCIpdgQuad2D.okl  |    6 +-
 .../elliptic/okl/ellipticRhsBCIpdgTet3D.okl   |    6 +-
 .../elliptic/okl/ellipticRhsBCIpdgTri2D.okl   |    6 +-
 solvers/elliptic/okl/ellipticRhsBCQuad2D.okl  |   13 +-
 solvers/elliptic/okl/ellipticRhsBCQuad3D.okl  |   23 +-
 solvers/elliptic/okl/ellipticRhsBCTet3D.okl   |    7 +-
 solvers/elliptic/okl/ellipticRhsBCTri2D.okl   |    7 +-
 solvers/elliptic/okl/ellipticRhsHex3D.okl     |    7 +-
 solvers/elliptic/okl/ellipticRhsQuad2D.okl    |    7 +-
 solvers/elliptic/okl/ellipticRhsQuad3D.okl    |    7 +-
 solvers/elliptic/okl/ellipticRhsTet3D.okl     |    8 +-
 solvers/elliptic/okl/ellipticRhsTri2D.okl     |    8 +-
 solvers/elliptic/okl/ellipticSEMFEMAnterp.okl |    3 +-
 solvers/elliptic/okl/ellipticSEMFEMInterp.okl |    3 +-
 .../elliptic/src/ellipticBoundarySetup.cpp    |  142 +-
 .../src/ellipticBuildOperatorDiagonal.cpp     |  349 ++-
 .../ellipticBuildOperatorMatrixContinuous.cpp |  439 ++--
 .../src/ellipticBuildOperatorMatrixIpdg.cpp   |  701 +++---
 solvers/elliptic/src/ellipticOperator.cpp     |   43 +-
 solvers/elliptic/src/ellipticPlotFields.cpp   |   40 +-
 solvers/elliptic/src/ellipticPreconJacobi.cpp |   19 +-
 .../elliptic/src/ellipticPreconMassMatrix.cpp |   60 +-
 .../elliptic/src/ellipticPreconMultiGrid.cpp  |   76 +-
 .../src/ellipticPreconMultiGridLevel.cpp      |  286 ++-
 solvers/elliptic/src/ellipticPreconOAS.cpp    |  138 +-
 .../elliptic/src/ellipticPreconParAlmond.cpp  |   20 +-
 solvers/elliptic/src/ellipticPreconSEMFEM.cpp |  199 +-
 solvers/elliptic/src/ellipticRun.cpp          |  135 +-
 solvers/elliptic/src/ellipticSettings.cpp     |   23 +-
 solvers/elliptic/src/ellipticSetup.cpp        |  155 +-
 .../elliptic/src/ellipticSetupNewDegree.cpp   |  108 +-
 .../elliptic/src/ellipticSetupRingPatch.cpp   |   46 +-
 solvers/elliptic/src/ellipticSolve.cpp        |    6 +-
 solvers/elliptic/src/ellipticZeroMean.cpp     |    9 +-
 solvers/fokkerPlanck/data/fpeLinear2D.h       |    4 +-
 solvers/fokkerPlanck/data/fpeLinear3D.h       |    4 +-
 solvers/fokkerPlanck/fpe.hpp                  |  100 +-
 solvers/fokkerPlanck/fpeMain.cpp              |   49 +-
 solvers/fokkerPlanck/makefile                 |   23 +-
 .../fokkerPlanck/okl/fpeAdvectionHex3D.okl    |   21 +-
 .../fokkerPlanck/okl/fpeAdvectionQuad2D.okl   |    6 +-
 .../fokkerPlanck/okl/fpeAdvectionTet3D.okl    |    4 +-
 .../fokkerPlanck/okl/fpeAdvectionTri2D.okl    |    4 +-
 .../okl/fpeCubatureAdvectionHex3D.okl         |   35 +-
 .../okl/fpeCubatureAdvectionQuad2D.okl        |   13 +-
 .../okl/fpeCubatureAdvectionTet3D.okl         |    6 +-
 .../okl/fpeCubatureAdvectionTri2D.okl         |    6 +-
 .../fokkerPlanck/okl/fpeDiffusionHex3D.okl    |   11 +-
 .../fokkerPlanck/okl/fpeDiffusionQuad2D.okl   |   12 +-
 .../fokkerPlanck/okl/fpeDiffusionRhsHex3D.okl |   32 +-
 .../okl/fpeDiffusionRhsQuad2D.okl             |   13 +-
 .../fokkerPlanck/okl/fpeDiffusionRhsTet3D.okl |    8 +-
 .../fokkerPlanck/okl/fpeDiffusionRhsTri2D.okl |    8 +-
 .../fokkerPlanck/okl/fpeDiffusionTet3D.okl    |    5 +-
 .../fokkerPlanck/okl/fpeDiffusionTri2D.okl    |    5 +-
 solvers/fokkerPlanck/okl/fpeGradientHex3D.okl |    5 +-
 .../fokkerPlanck/okl/fpeGradientQuad2D.okl    |    5 +-
 solvers/fokkerPlanck/okl/fpeGradientTet3D.okl |    5 +-
 solvers/fokkerPlanck/okl/fpeGradientTri2D.okl |    3 +-
 .../okl/fpeInitialCondition2D.okl             |    2 +-
 .../okl/fpeInitialCondition3D.okl             |    2 +-
 .../fokkerPlanck/okl/fpeMaxWaveSpeedHex3D.okl |    2 +-
 .../okl/fpeMaxWaveSpeedQuad2D.okl             |    2 +-
 .../fokkerPlanck/okl/fpeMaxWaveSpeedTet3D.okl |    2 +-
 .../fokkerPlanck/okl/fpeMaxWaveSpeedTri2D.okl |    2 +-
 solvers/fokkerPlanck/src/fpePlotFields.cpp    |   40 +-
 solvers/fokkerPlanck/src/fpeReport.cpp        |    8 +-
 solvers/fokkerPlanck/src/fpeRun.cpp           |   12 +-
 solvers/fokkerPlanck/src/fpeSettings.cpp      |   37 +-
 solvers/fokkerPlanck/src/fpeSetup.cpp         |  277 +--
 solvers/fokkerPlanck/src/fpeStep.cpp          |   49 +-
 solvers/fokkerPlanck/src/fpeSubcycle.cpp      |   17 +-
 solvers/gradient/data/gradientCos2D.h         |    2 +-
 solvers/gradient/data/gradientCos3D.h         |    2 +-
 solvers/gradient/gradient.hpp                 |   37 +-
 solvers/gradient/gradientMain.cpp             |   49 +-
 solvers/gradient/makefile                     |   15 +-
 .../okl/gradientInitialCondition2D.okl        |    2 +-
 .../okl/gradientInitialCondition3D.okl        |    2 +-
 solvers/gradient/okl/gradientVolumeHex3D.okl  |   20 +-
 solvers/gradient/okl/gradientVolumeQuad2D.okl |    4 +-
 solvers/gradient/okl/gradientVolumeTet3D.okl  |  381 ++--
 solvers/gradient/okl/gradientVolumeTri2D.okl  |  295 ++-
 solvers/gradient/src/gradientPlotFields.cpp   |   44 +-
 solvers/gradient/src/gradientReport.cpp       |    4 +-
 solvers/gradient/src/gradientRun.cpp          |    4 +-
 solvers/gradient/src/gradientSettings.cpp     |   25 +-
 solvers/gradient/src/gradientSetup.cpp        |   82 +-
 solvers/ins/data/insBeltrami3D.h              |    2 +-
 solvers/ins/data/insUniform2D.h               |    2 +-
 solvers/ins/data/insUniform3D.h               |    2 +-
 solvers/ins/data/insVortex2D.h                |    2 +-
 solvers/ins/ins.hpp                           |  174 +-
 solvers/ins/insMain.cpp                       |   49 +-
 solvers/ins/makefile                          |   23 +-
 solvers/ins/okl/insAdvectionHex3D.okl         |   11 +-
 solvers/ins/okl/insAdvectionQuad2D.okl        |    6 +-
 solvers/ins/okl/insAdvectionQuad3D.okl        |   18 +-
 solvers/ins/okl/insAdvectionTet3D.okl         |    4 +-
 solvers/ins/okl/insAdvectionTri2D.okl         |    4 +-
 solvers/ins/okl/insConstrainQuad3D.okl        |    2 +-
 solvers/ins/okl/insCubatureAdvectionHex3D.okl |   53 +-
 .../ins/okl/insCubatureAdvectionQuad2D.okl    |   14 +-
 solvers/ins/okl/insCubatureAdvectionTet3D.okl |    6 +-
 solvers/ins/okl/insCubatureAdvectionTri2D.okl |    6 +-
 solvers/ins/okl/insDiffusionHex3D.okl         |   12 +-
 solvers/ins/okl/insDiffusionQuad2D.okl        |    7 +-
 solvers/ins/okl/insDiffusionQuad3D.okl        |   75 +-
 solvers/ins/okl/insDiffusionTet3D.okl         |    6 +-
 solvers/ins/okl/insDiffusionTri2D.okl         |    6 +-
 solvers/ins/okl/insDivergenceHex3D.okl        |   12 +-
 solvers/ins/okl/insDivergenceQuad2D.okl       |    6 +-
 solvers/ins/okl/insDivergenceQuad3D.okl       |  110 +-
 solvers/ins/okl/insDivergenceTet3D.okl        |    4 +-
 solvers/ins/okl/insDivergenceTri2D.okl        |    4 +-
 solvers/ins/okl/insGradientHex3D.okl          |   10 +-
 solvers/ins/okl/insGradientQuad2D.okl         |    6 +-
 solvers/ins/okl/insGradientQuad3D.okl         |  188 +-
 solvers/ins/okl/insGradientTet3D.okl          |    4 +-
 solvers/ins/okl/insGradientTri2D.okl          |    7 +-
 solvers/ins/okl/insInitialCondition2D.okl     |    2 +-
 solvers/ins/okl/insInitialCondition3D.okl     |    2 +-
 solvers/ins/okl/insMaxWaveSpeedHex3D.okl      |    2 +-
 solvers/ins/okl/insMaxWaveSpeedQuad2D.okl     |    2 +-
 solvers/ins/okl/insMaxWaveSpeedTet3D.okl      |    2 +-
 solvers/ins/okl/insMaxWaveSpeedTri2D.okl      |    2 +-
 .../ins/okl/insPressureIncrementRhsHex3D.okl  |   23 +-
 .../ins/okl/insPressureIncrementRhsQuad2D.okl |   15 +-
 .../ins/okl/insPressureIncrementRhsTet3D.okl  |   18 +-
 .../ins/okl/insPressureIncrementRhsTri2D.okl  |   18 +-
 solvers/ins/okl/insPressureRhsHex3D.okl       |   32 +-
 solvers/ins/okl/insPressureRhsQuad2D.okl      |   22 +-
 solvers/ins/okl/insPressureRhsQuad3D.okl      |   13 +-
 solvers/ins/okl/insPressureRhsTet3D.okl       |   19 +-
 solvers/ins/okl/insPressureRhsTri2D.okl       |   19 +-
 solvers/ins/okl/insSubcycleAdvection.okl      |    2 +-
 solvers/ins/okl/insSubcycleAdvectionHex3D.okl |   11 +-
 .../ins/okl/insSubcycleAdvectionQuad2D.okl    |    6 +-
 .../ins/okl/insSubcycleAdvectionQuad3D.okl    |   22 +-
 solvers/ins/okl/insSubcycleAdvectionTet3D.okl |    4 +-
 solvers/ins/okl/insSubcycleAdvectionTri2D.okl |    4 +-
 .../okl/insSubcycleCubatureAdvectionHex3D.okl |   67 +-
 .../insSubcycleCubatureAdvectionQuad2D.okl    |   14 +-
 .../okl/insSubcycleCubatureAdvectionTet3D.okl |    6 +-
 .../okl/insSubcycleCubatureAdvectionTri2D.okl |   13 +-
 solvers/ins/okl/insVelocityGradientHex3D.okl  |    3 +-
 solvers/ins/okl/insVelocityGradientQuad2D.okl |    5 +-
 solvers/ins/okl/insVelocityGradientQuad3D.okl |    3 +-
 solvers/ins/okl/insVelocityGradientTet3D.okl  |    3 +-
 solvers/ins/okl/insVelocityGradientTri2D.okl  |    5 +-
 solvers/ins/okl/insVelocityRhsHex3D.okl       |   39 +-
 solvers/ins/okl/insVelocityRhsQuad2D.okl      |   28 +-
 solvers/ins/okl/insVelocityRhsQuad3D.okl      |   66 +-
 solvers/ins/okl/insVelocityRhsTet3D.okl       |   19 +-
 solvers/ins/okl/insVelocityRhsTri2D.okl       |   19 +-
 solvers/ins/okl/insVorticityHex3D.okl         |    6 +-
 solvers/ins/okl/insVorticityQuad2D.okl        |    5 +-
 solvers/ins/okl/insVorticityQuad3D.okl        |    3 +-
 solvers/ins/okl/insVorticityTet3D.okl         |    4 +-
 solvers/ins/okl/insVorticityTri2D.okl         |    3 +-
 solvers/ins/src/insAdvection.cpp              |   10 +-
 solvers/ins/src/insDiffusion.cpp              |   12 +-
 solvers/ins/src/insDivergence.cpp             |   10 +-
 solvers/ins/src/insGradient.cpp               |   10 +-
 solvers/ins/src/insPlotFields.cpp             |   52 +-
 solvers/ins/src/insPressureIncrementSolve.cpp |   17 +-
 solvers/ins/src/insPressureSolve.cpp          |   17 +-
 solvers/ins/src/insReport.cpp                 |    8 +-
 solvers/ins/src/insRun.cpp                    |   12 +-
 solvers/ins/src/insSettings.cpp               |   57 +-
 solvers/ins/src/insSetup.cpp                  |  634 +++---
 solvers/ins/src/insStep.cpp                   |   44 +-
 solvers/ins/src/insSubcycle.cpp               |   23 +-
 solvers/ins/src/insVelocitySolve.cpp          |   39 +-
 solvers/lbs/data/lbsGaussian2D.h              |    2 +-
 solvers/lbs/data/lbsGaussian3D.h              |    2 +-
 solvers/lbs/data/lbsUniform2D.h               |    2 +-
 solvers/lbs/data/lbsUniform3D.h               |    2 +-
 solvers/lbs/lbs.hpp                           |   96 +-
 solvers/lbs/lbsMain.cpp                       |   49 +-
 solvers/lbs/makefile                          |   25 +-
 solvers/lbs/okl/lbsCollisionHex3D.okl         |  320 +--
 solvers/lbs/okl/lbsCollisionQuad2D.okl        |  356 +--
 solvers/lbs/okl/lbsCollisionTet3D.okl         |  240 +-
 solvers/lbs/okl/lbsCollisionTri2D.okl         |  210 +-
 solvers/lbs/okl/lbsInitialCondition2D.okl     |    2 +-
 solvers/lbs/okl/lbsInitialCondition3D.okl     |    2 +-
 solvers/lbs/okl/lbsSurfaceHex3D.okl           |   42 +-
 solvers/lbs/okl/lbsSurfaceQuad2D.okl          |   95 +-
 solvers/lbs/okl/lbsSurfaceTet3D.okl           |  186 +-
 solvers/lbs/okl/lbsSurfaceTri2D.okl           |    6 +-
 solvers/lbs/okl/lbsVolumeHex3D.okl            |   68 +-
 solvers/lbs/okl/lbsVolumeQuad2D.okl           |   98 +-
 solvers/lbs/okl/lbsVolumeTet3D.okl            |   98 +-
 solvers/lbs/okl/lbsVolumeTri2D.okl            |    6 +-
 solvers/lbs/okl/lbsVorticityHex3D.okl         |    6 +-
 solvers/lbs/okl/lbsVorticityQuad2D.okl        |    6 +-
 solvers/lbs/okl/lbsVorticityTet3D.okl         |    4 +-
 solvers/lbs/okl/lbsVorticityTri2D.okl         |    4 +-
 solvers/lbs/src/lbsLatticeSetup.cpp           |   20 +-
 solvers/lbs/src/lbsPlotFields.cpp             |   52 +-
 solvers/lbs/src/lbsPmlSetup.cpp               |   22 +-
 solvers/lbs/src/lbsReport.cpp                 |    8 +-
 solvers/lbs/src/lbsRun.cpp                    |   12 +-
 solvers/lbs/src/lbsSettings.cpp               |   21 +-
 solvers/lbs/src/lbsSetup.cpp                  |  185 +-
 solvers/lbs/src/lbsStep.cpp                   |   62 +-
 test/makefile                                 |    2 +-
 test/test.py                                  |   22 +-
 test/testAcoustics.py                         |    2 +-
 test/testAdvection.py                         |    2 +-
 test/testBns.py                               |    2 +-
 test/testCns.py                               |    2 +-
 test/testElliptic.py                          |    6 +-
 test/testFokkerPlanck.py                      |    2 +-
 test/testGradient.py                          |    6 +-
 test/testInitialGuess.py                      |    2 +-
 test/testIns.py                               |    2 +-
 test/testLbs.py                               |    2 +-
 test/testLinearSolver.py                      |    2 +-
 test/testMesh.py                              |    4 +-
 test/testParAdogs.py                          |   96 +
 test/testParAlmond.py                         |    4 +-
 test/testTimeStepper.py                       |    4 +-
 788 files changed, 30979 insertions(+), 38945 deletions(-)
 create mode 100644 .github/CodeDiagram.png
 delete mode 100644 3rdParty/gslib/.travis.yml
 delete mode 100644 3rdParty/gslib/LICENSE
 delete mode 100644 3rdParty/gslib/Makefile
 delete mode 100644 3rdParty/gslib/README.md
 delete mode 100644 3rdParty/gslib/RELEASE.md
 delete mode 100755 3rdParty/gslib/cdep.py
 delete mode 100644 3rdParty/gslib/makefile.cdep
 delete mode 100755 3rdParty/gslib/odep_info.py
 delete mode 100644 3rdParty/gslib/src/c99.h
 delete mode 100644 3rdParty/gslib/src/comm.c
 delete mode 100644 3rdParty/gslib/src/comm.h
 delete mode 100644 3rdParty/gslib/src/crs.h
 delete mode 100644 3rdParty/gslib/src/crystal.c
 delete mode 100644 3rdParty/gslib/src/crystal.h
 delete mode 100644 3rdParty/gslib/src/fail.c
 delete mode 100644 3rdParty/gslib/src/fail.h
 delete mode 100644 3rdParty/gslib/src/fcrystal.c
 delete mode 100644 3rdParty/gslib/src/findpts.c
 delete mode 100644 3rdParty/gslib/src/findpts.h
 delete mode 100644 3rdParty/gslib/src/findpts_el.h
 delete mode 100644 3rdParty/gslib/src/findpts_el_2.c
 delete mode 100644 3rdParty/gslib/src/findpts_el_3.c
 delete mode 100644 3rdParty/gslib/src/findpts_imp.h
 delete mode 100644 3rdParty/gslib/src/findpts_local.c
 delete mode 100644 3rdParty/gslib/src/findpts_local.h
 delete mode 100644 3rdParty/gslib/src/findpts_local_imp.h
 delete mode 100644 3rdParty/gslib/src/gen_poly_imp.c
 delete mode 100644 3rdParty/gslib/src/gs.c
 delete mode 100644 3rdParty/gslib/src/gs.h
 delete mode 100644 3rdParty/gslib/src/gs_defs.h
 delete mode 100644 3rdParty/gslib/src/gs_local.c
 delete mode 100644 3rdParty/gslib/src/gs_local.h
 delete mode 100644 3rdParty/gslib/src/gslib.h
 delete mode 100644 3rdParty/gslib/src/lob_bnd.c
 delete mode 100644 3rdParty/gslib/src/lob_bnd.h
 delete mode 100644 3rdParty/gslib/src/mem.h
 delete mode 100644 3rdParty/gslib/src/name.h
 delete mode 100644 3rdParty/gslib/src/obbox.c
 delete mode 100644 3rdParty/gslib/src/obbox.h
 delete mode 100644 3rdParty/gslib/src/poly.c
 delete mode 100644 3rdParty/gslib/src/poly.h
 delete mode 100644 3rdParty/gslib/src/poly_imp.h
 delete mode 100644 3rdParty/gslib/src/rand_elt_test.c
 delete mode 100644 3rdParty/gslib/src/rand_elt_test.h
 delete mode 100644 3rdParty/gslib/src/sarray_sort.c
 delete mode 100644 3rdParty/gslib/src/sarray_sort.h
 delete mode 100644 3rdParty/gslib/src/sarray_transfer.c
 delete mode 100644 3rdParty/gslib/src/sarray_transfer.h
 delete mode 100644 3rdParty/gslib/src/sort.c
 delete mode 100644 3rdParty/gslib/src/sort.h
 delete mode 100644 3rdParty/gslib/src/sort_imp.h
 delete mode 100644 3rdParty/gslib/src/tensor.c
 delete mode 100644 3rdParty/gslib/src/tensor.h
 delete mode 100644 3rdParty/gslib/src/types.h
 delete mode 100644 3rdParty/gslib/tests/comm_test.c
 delete mode 100644 3rdParty/gslib/tests/crystal_test.c
 delete mode 100644 3rdParty/gslib/tests/findpts_el_2_test.c
 delete mode 100644 3rdParty/gslib/tests/findpts_el_2_test2.c
 delete mode 100644 3rdParty/gslib/tests/findpts_el_3_test.c
 delete mode 100644 3rdParty/gslib/tests/findpts_el_3_test2.c
 delete mode 100644 3rdParty/gslib/tests/findpts_local_test.c
 delete mode 100644 3rdParty/gslib/tests/findpts_test.c
 delete mode 100644 3rdParty/gslib/tests/fortran/f-igs.f
 delete mode 100644 3rdParty/gslib/tests/gs_test.c
 delete mode 100644 3rdParty/gslib/tests/gs_test_gop_blocking.c
 delete mode 100644 3rdParty/gslib/tests/gs_test_gop_nonblocking.c
 delete mode 100644 3rdParty/gslib/tests/gs_test_old.c
 delete mode 100644 3rdParty/gslib/tests/gs_unique_test.c
 delete mode 100644 3rdParty/gslib/tests/lob_bnd_test.c
 delete mode 100644 3rdParty/gslib/tests/obbox_test.c
 delete mode 100644 3rdParty/gslib/tests/poly_test.c
 delete mode 100755 3rdParty/gslib/tests/run_tests.sh
 delete mode 100644 3rdParty/gslib/tests/sarray_sort_test.c
 delete mode 100644 3rdParty/gslib/tests/sarray_transfer_test.c
 delete mode 100644 3rdParty/gslib/tests/sort_test.c
 delete mode 100644 3rdParty/gslib/tests/sort_test2.c
 create mode 100644 include/comm.hpp
 create mode 100644 include/memory.hpp
 delete mode 100644 include/mesh/mesh2D.hpp
 delete mode 100644 include/mesh/mesh3D.hpp
 delete mode 100644 include/mesh/meshDefines3D.h
 mode change 100644 => 100755 include/ogs.hpp
 create mode 100644 include/ogs/ogsBase.hpp
 delete mode 100644 include/ogs/ogsDefs.h
 create mode 100644 include/ogs/ogsExchange.hpp
 delete mode 100644 include/ogs/ogsKernels.hpp
 create mode 100644 include/ogs/ogsOperator.hpp
 create mode 100644 include/ogs/ogsUtils.hpp
 rename libs/mesh/meshPlotInterpTri3D.cpp => include/operator.hpp (73%)
 rename libs/mesh/meshPartitionStatistics.cpp => include/parAdogs.hpp (52%)
 create mode 100644 include/parAdogs/parAdogsGraph.hpp
 create mode 100644 include/parAdogs/parAdogsMatrix.hpp
 create mode 100644 include/parAdogs/parAdogsMultigrid.hpp
 rename include/{mesh/meshDefines2D.h => parAdogs/parAdogsPartition.hpp} (67%)
 delete mode 100644 include/parAlmond/parAlmondMultigrid.hpp
 create mode 100644 include/timer.hpp
 create mode 100644 libs/core/comm.cpp
 create mode 100644 libs/core/exception.cpp
 delete mode 100644 libs/core/factor.cpp
 delete mode 100644 libs/core/matrixEig.cpp
 delete mode 100644 libs/core/matrixRightSolve.cpp
 rename libs/{mesh/meshOccaSetupQuad2D.cpp => core/memory.cpp} (65%)
 delete mode 100644 libs/core/parallelSort.cpp
 create mode 100644 libs/core/rankDecomp.cpp
 rename solvers/ins/src/insBoundarySetup.cpp => libs/core/timer.cpp (52%)
 rename libs/{core/matrixConditionNumber.cpp => linAlg/linAlgMatrixConditionNumber.cpp} (52%)
 create mode 100644 libs/linAlg/linAlgMatrixEig.cpp
 rename libs/{core/matrixInverse.cpp => linAlg/linAlgMatrixInverse.cpp} (56%)
 create mode 100644 libs/linAlg/linAlgMatrixRightSolve.cpp
 rename libs/{core/matrixTranspose.cpp => linAlg/linAlgMatrixTranspose.cpp} (61%)
 delete mode 100644 libs/mesh/mesh.cpp
 create mode 100644 libs/mesh/meshConnectFaceNodes.cpp
 delete mode 100644 libs/mesh/meshConnectFaceNodes2D.cpp
 delete mode 100644 libs/mesh/meshConnectFaceNodes3D.cpp
 create mode 100644 libs/mesh/meshConnectFaceVertices.cpp
 create mode 100644 libs/mesh/meshConnectNodes.cpp
 rename libs/mesh/{meshParallelGatherScatterSetup.cpp => meshGatherScatterSetup.cpp} (53%)
 delete mode 100644 libs/mesh/meshGeometricPartition2D.cpp
 delete mode 100644 libs/mesh/meshGeometricPartition3D.cpp
 delete mode 100644 libs/mesh/meshOccaSetup.cpp
 delete mode 100644 libs/mesh/meshOccaSetup2D.cpp
 delete mode 100644 libs/mesh/meshOccaSetup3D.cpp
 delete mode 100644 libs/mesh/meshOccaSetupHex3D.cpp
 delete mode 100644 libs/mesh/meshOccaSetupTet3D.cpp
 delete mode 100644 libs/mesh/meshOccaSetupTri2D.cpp
 delete mode 100644 libs/mesh/meshOccaSetupTri3D.cpp
 delete mode 100644 libs/mesh/meshParallelConnectNodes.cpp
 delete mode 100644 libs/mesh/meshParallelConnectOpt.cpp
 delete mode 100644 libs/mesh/meshParallelReaderQuad3D.cpp
 delete mode 100644 libs/mesh/meshParallelReaderTri3D.cpp
 create mode 100644 libs/mesh/meshPartition.cpp
 delete mode 100644 libs/mesh/meshPlotInterpQuad3D.cpp
 rename libs/mesh/{meshParallelReaderHex3D.cpp => meshReadGmshHex3D.cpp} (60%)
 rename libs/mesh/{meshParallelReaderQuad2D.cpp => meshReadGmshQuad2D.cpp} (60%)
 create mode 100644 libs/mesh/meshReadGmshQuad3D.cpp
 rename libs/mesh/{meshParallelReaderTet3D.cpp => meshReadGmshTet3D.cpp} (57%)
 rename libs/mesh/{meshParallelReaderTri2D.cpp => meshReadGmshTri2D.cpp} (60%)
 create mode 100644 libs/mesh/meshReadGmshTri3D.cpp
 create mode 100644 libs/mesh/meshSetElementType.cpp
 delete mode 100644 libs/ogs/gs.cpp
 delete mode 100644 libs/ogs/hostGather.cpp
 delete mode 100644 libs/ogs/hostGatherScatter.cpp
 delete mode 100644 libs/ogs/hostScatter.cpp
 delete mode 100644 libs/ogs/occaGather.cpp
 delete mode 100644 libs/ogs/occaGatherScatter.cpp
 delete mode 100644 libs/ogs/occaGatheredHaloExchange.cpp
 delete mode 100644 libs/ogs/occaScatter.cpp
 create mode 100644 libs/ogs/ogsAllToAll.cpp
 create mode 100644 libs/ogs/ogsAuto.cpp
 create mode 100644 libs/ogs/ogsCrystalRouter.cpp
 create mode 100644 libs/ogs/ogsHalo.cpp
 delete mode 100644 libs/ogs/ogsKernels.cpp
 create mode 100644 libs/ogs/ogsOperator.cpp
 create mode 100644 libs/ogs/ogsPairwise.cpp
 create mode 100644 libs/ogs/ogsUtils.cpp
 delete mode 100644 libs/ogs/okl/gatherScatter.okl
 create mode 100644 libs/ogs/okl/ogsKernels.okl
 create mode 100644 libs/parAdogs/parAdogsConnect.cpp
 create mode 100644 libs/parAdogs/parAdogsCuthillMckee.cpp
 create mode 100644 libs/parAdogs/parAdogsFiedlerVector.cpp
 create mode 100644 libs/parAdogs/parAdogsGraph.cpp
 create mode 100644 libs/parAdogs/parAdogsInertialBipartition.cpp
 create mode 100644 libs/parAdogs/parAdogsInertialPartition.cpp
 create mode 100644 libs/parAdogs/parAdogsMatrix.cpp
 create mode 100644 libs/parAdogs/parAdogsMeshPartition.cpp
 create mode 100644 libs/parAdogs/parAdogsMultigrid.cpp
 create mode 100644 libs/parAdogs/parAdogsMultigridAggregate.cpp
 create mode 100644 libs/parAdogs/parAdogsMultigridCoarseSolver.cpp
 create mode 100644 libs/parAdogs/parAdogsMultigridLaplacian.cpp
 create mode 100644 libs/parAdogs/parAdogsMultigridSetup.cpp
 create mode 100644 libs/parAdogs/parAdogsMultigridSmooth.cpp
 create mode 100644 libs/parAdogs/parAdogsMultigridSmoothPrologator.cpp
 create mode 100644 libs/parAdogs/parAdogsMultigridSpMM.cpp
 create mode 100644 libs/parAdogs/parAdogsMultigridTentativeProlongator.cpp
 create mode 100644 libs/parAdogs/parAdogsMultigridTranspose.cpp
 create mode 100644 libs/parAdogs/parAdogsParallelPivot.cpp
 create mode 100644 libs/parAdogs/parAdogsRefine.cpp
 rename libs/{mesh/meshOccaSetupQuad3D.cpp => parAdogs/parAdogsSettings.cpp} (65%)
 create mode 100644 libs/parAdogs/parAdogsSolve.cpp
 create mode 100644 libs/parAdogs/parAdogsSpectralBipartition.cpp
 create mode 100644 libs/parAdogs/parAdogsSpectralPartition.cpp
 create mode 100644 libs/timeStepper/timeStepper.cpp
 mode change 100644 => 100755 test/testLbs.py
 create mode 100755 test/testParAdogs.py

diff --git a/.github/CodeDiagram.png b/.github/CodeDiagram.png
new file mode 100644
index 0000000000000000000000000000000000000000..649e53acc60b74d298043b4a9d729dc10cdb3188
GIT binary patch
literal 43595
zcmeFZcT|&Iw=YUZkN_Hrh7xQP0f|baNC)Y?NRuX_5Tq&i3X;%4K)Mu>5{eKyC{3iJ
zpdd;Qf*6Vj1d!em%6$U(>ig~e?R)mPXPkTQ8Rrj%Le{gMHRqaZ&iVVzxe}_YtpcKl
z(36pofz(u$^vTF5G=U$BBQ(HY8h`Td0RNER*H=*>EB?ee1N=gDM_x;wjI1o`=(Y_t
z@H?HWs_A_)GR79t4|$sl<`x-Q-l&?Ayx~L3`Bd6!E~D4GWt6cB^w*BPCD5+X<BjfO
znbqm}c^+pTQB|qQv}C^PdTh?$ouN)6WHl}NdtjI2^trRy>^&V@yR##Yd?W+~aUav3
zU4E2~yxo9X*}nIw@QUN@Z>F;&jz8w^o@Ss4XtIW#cru8#xd}T#`VDhjRP*q!M*hK}
zq)!=Wvedg^7xzE5Hl%~_9zJwF<^!xV>7wF=hV6S_?xZ8Xze?XJT%G%3aOKM_u)>j3
zfziR_yWcOTZ)tugX{>5&xD{a->_AwZxwknMqEY2M@*N$(7~657&QLjmQ$R;{?div4
zi8yB9!E75}Sw9^=D~ZKMN{wA=85$I?)7dMUQ|UhSwR<}YiB>6Y!L>_-+1qWvOPCcP
ztec7Hd)rgzWbZdovm0`1zf`@I6e;oJ#1`JT;k<#l*+)jwtby)qqjw$<+l|$Cc^wSl
zn)Y*9&>hTc#r>6jqTlY+Q{ggllUh3Fn6!ApxoYIW<-bTDG537&`>eM7@Inu4EQnuf
zV!~FO^k){<&6JJ01*DM`Rg|da!*EWKhvpIcQnOIk9%00R*>h}~H&{k>7B;c2z#@+b
zNG@Joly<}Q-h%LYzXsQrG$Nkv3v1uF(YU7+<dC!&k+d1hS0KN?piFWc<sO(7m<0QH
z;`zlQ8ak4wVKm-M+d76x!E}2Y>W}u`#95_zRNj5n68U3qULlOOcn#PGC&VebK8^eG
zb<v`_B4<ISYZSj`Vk~NnrEqf|$V!~^?zY|{h@R3SD_+Zy_PTRiV6DgdduF1KjD4k6
z810%uPZxI`u&QfX4B)RLpZ30g$KI98Hg{^s$_o`&_sTFuNW%Q|M9vEI9aU4jUe`AQ
z%c%Dt%ig@z&A8T6OM30W2TIf+@Q3(9ii)Ar??SvzIiM|7AusXg&f6!c7T;pU^s@=b
z00Sw3_eC3t=*hPz-8)wxS5&FGlmQIHu*U`$dWF(Mcw+@ksSy>|!kB=8a)4a7`ZWx@
zU@<p7oYyT7*7pVm(%^%#PZd~Z0w()i{kor-Q_Q&H-wHU+Dyn(b_$7TGBAf*Z4D<t5
z@;*NigSI*SI@Sn%-8_5qf9{gQf-h?hbGj?!?XoW*zkKA2&>}5qG6>N=b>CPA))7na
zceHUh&Dd9VsF#1N#?!Y_Y~<&}Z9?wn2D>bGeJhBTNOl=XBY&#y`lFfMepHU>=?KL$
zGeM#xJpw!NTj2<%hu?!AybkLd6b-Xfo$@`Dd734dPLP_+;k|*3S9=ggpV9Uu9&an^
zwfy)drEvE-Fmd4CZdg!`U#ZC8z)KWzG?CH_w_yBmJGt0yhtODZw0R<LAf#4qIDvN@
zbIqp*vY=~D<`g&?9eKZqpu6fli<1AEB?os`f2h^Xh$rqWcLWlv)>*j?9!m8SbeT%6
zG{A1?$isant@m2)MNQR};eI+wS-2M85ucll*`qqV*NKQL+m+jNYP}W6@~JADewE~e
zm0q}36?r9jWUme_|H1}NQhB6domO^7(jJRb_G$ibS)_~UVD{^ebURHvPa!zHThLPy
zwY;iFdp)eg+lG{H>?|hd4{YxgX!}_jUn{9f7G>rx`H6D@+m%Oh-^X;iT2;^>OGS@`
z_@rdN{$;C@@ixmH<P3|uf!Q8Amt+RcW}vwok00$oST%=IJ0yhC&6>~O<k+0_tvm*|
zK}fF8Zb(ykqHqX0f*r8MvxCDL52YuLcw|pkEJx5)w!Ad^B*xd+bEdKgb*fVbTa$ru
z?JPnD&E$JDEXWK^>6)+YFM$?sa%jUOV|RzNu)sfT#u-<09k-J!lyZDYHE{Mw^pAzB
z2&+Nm8W-zkr*rt~min1WBR2c=BII|_y6u5|A(+JfQXA@@JqC%!NZk#Y((6>ScV4X}
zLQ3%grG?HRlbP;x&6Q<bM%SZ2n>z(!gAFf2#dX^bPYUTsW%hFL%5tq0eozff4wS27
z0LyuamznMisL9edxC~hdGTAq<YGF}?K(G%)7uTc2iLtHd=U#nS@Qgeq7h|hz@x8wV
z7Ub1kWAl1ZG9=KcZE|Kwz$?W@a<L+n&wlvefrGP6asqndEi7WC=1ziRfiJ)Y--6Zn
zvJi2w$C5u0XlX~B9N&h}JTJMEdGcdQkHX5K08819BU{s3izOkkt7elnNeUK0!|G2j
z6lBbsw$Y*C-;J+Yc?KZOk{`E7ieyXt+EaSFB=@e3j?K4PnKtHnV1quTV9^d(5b@>E
zyDLL$9pp|emt@?O)+|CLgQeu&kZj$~N@yeyo#HdkBE8pN%(@)IZRx1dF<40m6m4MA
z!@j@drY0|~aR)6uUr9UH3kWsfd8(8A(6Q<9w`7A8Wd+b&4GbuzVJwmWwzUkV+XM?#
zw%E#>^(ozsS4rO%eOeaTcK&Fo)5$=oMwGDD+YDqUN5A63ZL%3Ji&e9&+w!x)*mvr^
zFV#TcMPfE_W0r`8DXo-R$#OX%Anhh?S;WS*jg1go(G^F~x`V^%NX$x)z0!>Tt=#us
zr?%oq%H?mNa*tkK8m6dWg33abw0QKuCwqii0+Cr!o6<sAJY4g^GhtTHZ0Xzc;l~F)
zgUekG%&0wlaocaP#%!u9c{-ZgQm}|4p{l%a&B#h!KUH81UNLkeYFJFS@iQgY#WLFl
zKw)xR1x4%DsXAS0_m96g8q1ZU^Y_U&T|3SSx|0;<?~+uwow@ITuTix@?nFM1Q0@-5
zb6(?SjJU0DpR}B-x2hmXOByb!3HJ{ckm$^2=M)4Pa71LXcfiOCd+aP?Dj2|8hD3Xo
zy68{x`|i&m0&SzNeR+nuO}(X0Icxe{JM3B=V>A^cXWQYw808%F!E1pzn06AW0A@+f
zT1>q}?I~#)xaoA0W3m}(EBhX6OQy#wGDkVIvQh!H`GWaNWP)IDw5$HtaY_GYsM5)r
zes@U|!L2}P{i48O5u|FLl-VO>t*iS<r{&6GXLv%u>u%)4iROFBx4F`B>{^sw-ezK=
z1hY>#ou^A5kCr?A3J-k$jB367UQ5Pzyqn`8aHjf;`sJXJPl%*)&1$CXtshiqu7bOR
ziTt{)VhV_)J|ThYj|Lfu;J;AIWTXRgW=_n3`_B*pL|;IYaH_HP$1({$?r{E$jM8PH
z`?p>MS*K^{QDUZ^b1?T?_ALdXpSXDS!GhK@84L!Y5Z)fue+j^lYa)6yiXdM|0KX#n
zGE7rXP>O6OLtf1`Y1Krb=LFzBhHZS@p*CKYXs3Ue!SA@7PTKk-EZGHlk1vbP!!`Rq
z2FEA%!5m(f%Jcx)g5HOJ#%my^2`Q2(P@-EYTbo5_t);f>U%)koLSPp1mi9`qkR1Lq
zfuayzeM6GGfCuk+-VG^KSf#02R-J#J$%TL62^fP(Fd2q!xfc$)eXY08-te-DUa&$(
zs{_2)65bNHU7Es83jzzA-#1sMq~<1dmsT$wU&%o-pJ>BOnrBj_G~>!N$O_S8Bwzg^
z$ScG0W#*LdTdyt2eD8<SSs$O2FgaPRv5#{^xpJI^Yx2xpu5|m{1?#8bib8l?z-x)u
zKcG3-j^m?DqPwHVe_$0<hpzB7;&ny(jtX2!y(S{L{#~NjT~t)FxUT&(=yZp+H{x7$
zWExpTr)xht0c?GTa;gdNHJ%o^%=i~eO`bsPQnAE_y30t0Te?QsuE7x0r7`k87ay2<
zrbg@=Xb#mhRqi-b;*11&rQeD*;3R5`4UVk!Y>rBq047p0t0V~pUR*uZ?{2-9`oY(|
za<?@G@EqM;!2&67x9Pfkn8=i*TQ+*P-Oq)>w&`?zHa6Jd<cOMaofOn_6qoNk0>4Vs
zWg&KMA8`T7I)|N?1G~#qL||cmYe}_ihQX^zIsl26E{S^Ce2@O;)$fA4Cgja-<bfM@
zeVf)1@F7`hyJ+r6kFd&<aAo|Gz);#aG#&o4@A$rJ#ei8;dtl_LJicMg?8zQ(!ehZs
zrhzx#F6;(XDTQ7^j@jcptZKNzG~N}qWyU4?oC(|^A1KSyofXiZ`wXbdrs0)Bovujs
zpgT=n;6~NAxZQ_YJR?w@RT-=kSK@Y=!tloMtDkPN7PBD})QvkotD?N;jH_53pD2yE
z)-G;@NbPNQLD9oNu24o23&>$aHFv3Xn^#MV;W2Mt5Cd=aIk8&sxAQm4WHe+9mJ5$C
z5_6|`SW@F7D`Tz~S$X>Jm>$Hhp1;>IDzm)Mi*dsYiTge}u_fue8EETQ$r`CKCgbk?
z_)S|(R2iS9j-)byN0##uo^E~LEr*6_J0ukzB_rRmYpEHlJkF>7?s%4uL15++1tN2p
z*j@HX_p3r+KK*Q>NB8xIaRZQ5lCc2cIX+<#VnDJnwuoYOE@<;b+6}gzokudF(x==e
zl#=d5NL-iLe*3`i4VmX|H9FAtX0=ns#NCH7c<S|izZ4qQd?E+IZcvq_dX*fn6j-bK
zG%W1gTageaX>Ms5N7>|@VZBlD3g<p5`_%A7<qrCxmxud*uJ%XY(KUszYgx0`7UCCo
zyWD-gV%Ev8&6;?Qow!MQFM1+4b0XAV)aQn7^DcV9Em&u%t>WzD3Kg;ZK-n4BleWW;
z4Mc#g;F)h9?>P7@_OieDu{Bx2gp$a1Xz14F$6)AS9|D6a$DQAs`PKufJ|m@Lnqi4y
z*S`y~5b=VljNz>KIleW>4D6Hy+N$;;og?5f(@e{HR=uSWNaCcNI4{DYL6dW+%`5Wa
zU^>ae^PQJ`Ve?X3yk{qzb3p5|TcDDb?@&%=W*;gu&B~JaJ)9i~!I6T6;PtEdPT{wo
zljX7FgIJy`4}^bvuD6k#EKrwTL>yLGu-j%>QFz4)w;V%74u`;HtcRf-@vcvlxE#~n
z(6XooM-br-$@A}Tt*>mUqr_fwkcS`rfKKm~o$_dL8w`)1i27Y3a+R+`w{|*lUF_p4
zXR6tU<U>onrSlsZ+-R%A;9Gv+L?Q|kF^}}S9~pFpMxGm!p4u>O<7w!`5GC$*Kpdlk
zQT8VtYa3<!oCOC?gwB@=RhG$?J8^LUvmxl#f3L*#l&H=UwOzlF_b~yRXVKMk%nn15
zINJFFieq`=E%gXuMz!xbd$fml^QFPu{@cRxvO-#K{UooOI{<1=lv?^6&-AIxsQ$A5
zM3YC=GUV7Q_RtW;26ZZkd_v%F8$7ilEoDbHx6}vOetJ4GEOq$mezv)4CWd&j+CkA!
zSGtgD=N%drd_^;RCFTILiRN3n_cZ!0<jztVvUHA%|7T$$-!hXSF5P%Rrz3FUsG7>3
z0O|C*T)nm9car43+8XO$g&2usU;Nh~lW&=Sk{8JiYNZ?A95Hk2QF(|cT--eBgg~D2
zj-m>*BO^>E<_%bG{!F$qo;BZDpCy}@?Yf_(l$vGc`|F6SF8yb-=D?t`Wn2(Q(y=|R
zQ`?dEP{|c$LFHZV?JWd5U37^4q78dC624A8vIA~eR%SXY{rnVt8$?TuQkQ=lKl?S{
zEmYgy7Gp~hfp!<Hr4)K8^5>5R78o@mbcn|*b6Y=(d0=Kac>Cl}jxcst&e}yQ?9cX=
zi%H<hL6Y?gB&RErCc8Bm^5a~nyx#cw3yjS&6lXebstmICQD~XOM&Z+$hK>y9u@9E~
z+x!RrBH5-Vo(=ajFqo?Kj40GCSU?lIu7Gezini*kCZE^nS@!h%ueHPrhwkEItW<n9
zIzg(4q%N8s@B0Q(tU0C7{w`E7F`soX@!!Ax+$Wernw^k0rOn#qJe!y8016rVs{rSQ
zuJuSmkgo&aEV8^!G3)>|8-N}|N|XIaOa+1s!*0&5*QY3=7fg4Xas~!Lc_q2@Q>%dt
z>~ECIq7#DEhzlEB^D|4QM_i0Mh{*hN|6E2`&Sg!4Kxf-|EVnrMP|f2^HY|?_CUMAr
zq7n$t!o_|;mJ7@{SChsYO7R+qB;U%31$J3Sha}&hZM04VkL_A7JR9-Vfy0a$C#CJ(
zhQVsI3_gub3j<^W2b1~(=|)c%#!C+!rMzH!=fz{MQx#`OXf2PypfC4LboONCD3$%}
zX?lE+mzrW0o_e~X5PMa+Q@MLp0ii(x-UcUmSHr>9P9K1D=y1#^B-3X%|LljT%$cCR
z1R!4s%ekbP)YnyoY0{*9!`x82<}I5tT}a35#M7B>Qjx;cBWl$YSh%$*{n6!Ym&zgf
zR#B|$<6iAPIN|+AFt-b(dQf3G?E2=UY$v%L76gtoqERY3F@5&7*5fg&cpfc2xq>G-
zTXEUijqiKQOGWr6unwYKKL42S5Lx~PiX^2pW&6vVzg`a~&n#SX_Se2aaKNRL6@6e*
znm?n7eXxBgf`Q<8h=W;x9q`))fFKDs9uQ3X%jtm$B-wXiOB)8hXM>+Lm@&uag3-sd
zt#_$raw+H7i5pc}?N?4u7aSN`K{wiRnf6!GcR(_cl3~#b8;T^8E-Rm#uXyFYXL0gg
zmE{6Lgvq(a&!t7K{tQcea_Bg8Jk}l^b?Bd!1HC@5|I*SQQmq1rS7Z<D`ITx~=c#%h
z)#ED_X?u^~ea4obwKuoHLFiUVDGUi?Kh_o5^4~qoB>s@-8*C=qUrn>OkWQDcXmR8?
zT9A@|BC(&2QMPPeCv89`)>ks37U4UwA5Tr3;HUdE>PN)^yh(D{2UHlz9cTgByFs0{
zz8qh^>*?l74+hEZE$}mK^lt8+D+Se1j&}KVX<wv0q&*O+2oNI9EM2m{HcX+ekM`Fx
z{}9bkP*ZYdGY}j-p15QTF1)hsZLQ@i*D+hE-7Yf_I6V~tzM17GbGSv45d4evU)nk#
z`ABRa$s)7Y|2O6i|66otCU3CAbFK$ZQ}?#eo@Ck8BwqImg2<(%^;WMZyuh(SEv)0y
zUbdy`y#q2e?~M&%()3OT!1)?vYf%-$izSNd0L#WC{s=gvXNR&9YdCma*T39J{K^H;
zlXo2dDth(UAj2~G=pl7#A7!a4dmeE+X`g+U1dD2t`0Z0h0G&$uPaXvbOya6lS?v!-
z<jb`vk%sD1LwNVO?ekr*W2FBW`UQvXQ{eQ!rkG}Kac}+75L@(M+cgi^bl0LK9FvKe
zYO{p4$W?t2Ey`l}nnG-kWgT-Y%e3%$#LXn@gm1Wl@Y<=SfbYm<jUKAnJB#wm-@cjO
zg7m?**6<ZUQk$dX+Q{5EUhBnC%51SUh2uTkb!3paADPlL9BWyC2I&ewNpeM@@ht2&
zR=BS^J0eaqKIWjzgzLtRQ?$G~13VFHBqBQ4sq%>7!3YS9HwT{=kTEScV8aM|>)$0Q
zR>&)($yxg)WNvx$CV&|~#0w1x9PMM17P#_@-L?>UuuoWD&^2Go9{j~NKd2kr=e*y@
zIW@)I{6$!QSv#0yj-rBECk*t9^TvjA?ZahYVyXW#@&6)KD88Dp>kt+bwwVl)+TF}}
zVY{=tAzP;>x!60NB(^~H3^q2}qa)aGIp#lZaH^V!NQ3t+2i)j##|Es!P;u7W^At>i
zf4Bsvg3ZkfRUe}99aGpkDzcsP4EL<j4S?TS-o<mU`!DC-(ykSKc*>Ig+XA|tt==0j
zwYo6R;XR{a{@gA<W38x`xdYZzZWDx`fG*U_n<kxlT5&zC99Q-Al51A&Cz{9>AWlrK
zIE6fmv*VM&xM<}7^%-`I@WTU}{glm~#6tEeP$t_`C<krXvuhqKKo8f%*iN!jb7lcP
zp?zwY@iHGkIq2e&HPx0r{5D;HA0Y5E6u=*H<~EqyO~}<{@8sGHo%kPk%H(LP8qSU$
z8ot$7J+00nJ{I^n(XwV>zEn!{#E>F4_v<FIYbSYSwK`0u=}qi&+^^!Lh324JC<yn|
zR$BlL>g~D*YVV_*BeBMlwq-3JKfDA(`>J>i^u1sHeJ&L5bZcyW)BS2Iaf0r92ICX9
zE`ll(vE(+x7VI+2xF#aqEtu{H$fGpEz3<(Xu7)s!fNUV83$g9G!rHmAcj@y2(@@WZ
zyjdMCyjf;HD(cz%xb-EJ32z=JARzOF8gVp@g#++~8JDN8X;D4#&zZ56NTO>H(#^`E
zp>C|*6ckM^E--@e?56LuJkApcy{xHjAC^Dq04?)rC`>+MA^nnpMb4^Qb~G0YS$#(e
z=3v&_+6k$f=z93G*zq_Ebmgepe!WLBz_L$ivp|jTP*{|KKfv8_O?h^6C+ZPYy;EJ7
zA+8t^6g8;;&bCssF3O(qEL%4fEZvX6+J;FIrVqvg2m`m&!*m|_4f*rE3kLlAG=rC)
zUt^eNQt@pCV_PzAy;C6GcOnHmpBH>mX5=8t2%97!+gzGXD|k!U8IV&Tnn{mX$odNb
ze@qd&x+ITuymamz-*1m4T4UKq)?}h|k_KnZ!{x9!Q<ZfvALfM(#h9%G+@+rU?E`pW
zf3DeKwx8MjpKlvL2^hzWH=ch|@6%@rfP8nP&>cW7`ndFg7_<nGHt}!Z{*6UO6tW|y
z-g|N8VlsLf*upPKG<t=oX@%RBBDNU8UF@IAymE;2xEIZ+Xv@z=+2UR7q)g=xEe*Y|
z{i~TtV+Zv)2sZ>sv4F3dpmA70ZVl>t$_g9*NWI~V+Yw#iND>;7=hB$#^ksxyw`4i-
z1Oex<iEfq4OZP3v5XlG@3E&L^)NFsNZW<A@rcjw_x7~2M*XZP4TI<f8op{c6YmS_c
z4Jt9k^Vt_Q5nl_XIy4>9GoQqGc+(Rc=SvdyrIbs>{OtqE&ph1N*t5!$)vMiDI-@?>
zb0SCUd~;#ELyt@da)tqo+fODn=GGMWkfRM&%ptN#K|YA*!Q}v6_$cbpgHHD?(kW$=
ztmYMl`qu!1G5uvk>z9kr5^MkFi^0l0BDP~X{E``Ccf`9*rgI-M<1aZk?0%n~eb<~m
zD7h@7O8={l)c%|wJVv=Qy`?Xl_rdpqbtM_9?REdkY*j|oq+wi5k*p%X8_VNV+fe18
z`KoWd2{y~{;}x+vNM)JGYr}2zzDBv%(NNNI{~@057kM9R86kI8z6cvHY+|LrGhVp4
zX(OtuvcL7s=b@sniRSfndX`xMsQrqMP#N|p7mw@Zi8~b^G-<`(K}=$g+a*Q+(78>|
zdv5hIq7)%=VzhK2JDqCn>sW5Q6{ruOXG1*dmz5&uKfPky^h$*-67Aqz04xl6;WVD>
z;iTvqp~n^oPW#C&tJrl%I2U}Tr)YOeWbP)1-PQ3w<>#2=`+&^5jg{Tlv?vk#BwI{f
z9xxm1)F2~QZ^_r!OJp6+sEiJNur115nGtO4VQrzq<r5us&!*mA3a6QcoPNojGek=S
z5FkJp2L0`lx%?o!f}a85+XL2u^OOq=xu_#Q?C1BPb(Yg)V?O$?+67J=f!R;{>I$~p
z0^N%UPJ2_`beZvQ!T5^1VAlHdd|8;;JAU{0OXl0OxDi9;JGw{JHVT3TKCI2WC-Mjk
zM*R(x=e&OV&E81g;Jo_Lq<yv~VQ@A`o8`pfMRKwN&c?H0FOxLgdopd#C)g+u)iz@K
zs>z-zZk=5~&YTY{9uG8At-%Qe=biZ(kZmoyZ-%5&{L9&QzmMouf3p+31zOn-66H(9
z?YXMktHXUeCestI#V=1*ffnm~A3>$;v+i+G6K?H;w71+Lj6XNI(p%RIF3@-~jE}wH
zb=w37p=u(%&c4`6f)$1~yL6P$x*uOPzR(=1Q>C~a54xL`;w(5=RcslE#_<QBY&hcJ
zk;kTv4tv$=?X+#Zjjc&&L%DvdkU|kA<MvMgsW4S5N8||&Zk}bQZO@Rpp0IfJQ-+cD
zid*NJR}l8<45|VZJ3C=TT>nN}K|$Q7cuBR`I3UFT?N*15_4UJ$4$s#X#Kxqvd)}zV
z(<u+Xvl@)o)Oiu{S9+<V!<(m8p6T=rXGBCb&1hxyXyyw%mpeSpPKWPr#PLU*lGU?~
z5$UQ1EzH`meKGG<;%zB{Y<wkJD&cyYP7cX8<FTS6tyC|ZMooa;5!_#LmV>g$jUzMs
zB~#R!Yh^Ckc};I+7O%GRKxJdN*R47ni9;^ZI5Syc%pvG)KPr8N^OcLUaCu=oyZ=4@
zC|tjkxn#GauVW5YV^#n#khQoQ(7mCm4H*pUzcmp#t>EP2&?I)@#1NbF@gBhr0S}<E
znNKaxFCewZ)&t5hM%geX5#KW)L3NaA`4?4@kgNlct1;&`JsRmZ48>ebnmzmT`4jQ&
zP>NwQS?@Ng+cYzTyy0a2=1T9vWj!#or|?62d&|W$pq=#&(*mkNu^zJJo#jS+1NUEp
zP%T=_)YLg@k3)#+>9>+cb8J+~6bqhSYgq~jB*;rhhj{%gbo8CYVD(FkAO4&Gs_^|K
z(1aBa^621~W)0FC@<olPLrxM%gMeK6?}+<=3UN!wTQN4#=F%R%Xc<nvoAs(ZSfF@E
znF4tLk#9yR(*89o&5TreyXTl8-W+;5Uxrujgs3KFu#z{dR2W)No3k}Z))UR1WA16s
z<S<xe^+_EoYQc4+j8IZ=DuKK|zQ`4)CU0?wkJCmWvt<G>F|T>nIuzTk1eNMa$#a=+
ztHW<4l#|nR>MS$Z#@8J!T*6B`7l*2*{;R!dyd=ivNy?b?D8E;w4!~@#=Ks0wmgK>B
z*Vv{B>ZeTI%bO!E9bRop!>h(rg`ubKcC?)@aX?A<iq88~NX<X@(bSvKvtupTtLYg{
zG${8reAF*TTCmdE3${j<(ZRm2fy$Th1XG{}K7u}%y(K6;lva$NUZbywc5mn}uF`op
zKYSnBM?crL@4kR;9UTDP{GB;DsK~V#S{-sk=hnrWn?_V<JyLJ2#@&0IP&GtvY~+4#
z1?N!48&-`noy_8u$#&Q)n6%NaYIjWEi9aPRrU%CK``o2$NXeYc8Y-Z{12n;ze31$&
ztc6ybk4#o^I(7RNRuu=<2j|ed^J%gI%)lw`XVuW!dRZ}YfQY{SZ@m-+B!}RCKsyMP
zwb_7vs`s4*2lXe&K$-$Gg3ls6no8KQA_d-kZ$rv4yXKzL&vUYvB+G)31NQ?ip=xff
zH9Rv;@-A$P(!+kHn9MBZV0c{Va`J1bWzWv_H{WCiDn5JtS9;NPY5Cr;SXAf}^5L?a
z1E`m<zQt%j$Mv;^-Po^~|6J<ZqEL$IFn{e^WA(tc6<bM45cc&G6nD!oO&bAIt6^H@
z8YZNL`XkTC;=h_qopx3z*HnV&9&+qPj(vd&R>^E0J3_h_YIE|9$|D?q$nH{j)irym
zZJ5t{5lyh%w_Y<LsDH)!(?EV_`5TCopTShwG+TYx@&Q%tmVEz;o=7w{xV01X+%D<c
z{jF~=$|H8wi(}byJXoelxIwpV-?`bsHlR$6Jkw#1)45gplrq49KlwzV14@1$s2p}}
zS<!J$CNhX|MKf_vg6A$({!W#q*^%{#m%uiATyWQCic*V$7uO-s#RqXqkjb@Byei{?
zt4yExrVo5ym`@GPa>h<i7p3rP^t0ziTq<VgZ7m)}PyScuMJ|Ft^Knq|kx8I%wqQfw
za6^A1#Cr~mYs)TB8NATH!~oVb$#a@(nJ+L5!L^j5u2Bb5IUo{X@6FeQGI{AWlU1?n
zJ5ttKEcG^u_3L!th<c{+#Hi#=Q7(_JU>~WWN@pT*RUU#T5?dDDD?U&DVVioiykyEW
zN{yJS49ovhAaC&Fm5_-(TUxyWL2=Yk+>H$2WU=&9FLqGYS=vaCl%(Gt!&VWdpiGr|
z0}bmu9Y|flQs|)zxx<4`j=uFTevBY4L=Qg)xPucMIR5zhM1k?=)f%m;9j}{AJD)ow
zkqtd)Ri7Q@i!6E8FvCGfcb&QqHp;hLD5>`rr0y2yr|mw5psL(aV{WiL-M6zzGP0-L
zQln6CWlZkyG)K?sAZr8DkdN?e)l`6cLoG$`_mWtfJ^rlut*E>nUs86tA6tDx8(;Ub
zp7WhwOd;FjG~e_nowpLZ_BkUy_ck?gsn>`Dl^iq>)r`B-S3z6TP<?-Nw&w4%S!CvS
z|NE@MLWir}1v)oupy59a@x2Le{>gnHs^`k%plB@D+)sxV<Q2EFu{@(IQWt%*Yuc-O
zgU+V1?EDj=c4tn7q%kP+{`?uhn$b4xbda1&B=jaj&v@2pTK(}F-Sy7~1FUE>gI|xv
z|8%X;G8Wz`KkJ&}2;Jvl479so@xNbb)&9=9C>`NfEH!o~p^bbEZRvID9lCqXA36D2
z_V87|v-kIZJo|;&+rR&X54AGgRGwbvaXHz)30D9?m5M~KrnQ`r+KZxJeA57#i@J|8
zEj)kbvx~+`GcxN!PZ8^dozfQ*{P;`@3xQs*v9M>Aq4hWCx{aK-BfX0_j<3HSuueZn
zuJ@Z7e?v&1;7Q1!{_$c+V-XR&Ejyswn?<ACvAH0BN(%%+Vb@oqF7=i3Bf_i^+wP;!
zg=#7DT2(f(oKBDUsOW>@22^|&-vCW^$nw246rAj^1iRnV`3E;K?d7+m3nsS~@7vyN
z^yYyopSf|&f2><H3G_6;W(kxuu%Q*bM_rjrv%R~C?7reYNjINW=<>vOy-DmUB1x{g
zftm9tfv(xM2tIvSY~Ak<{e!DOvnNql^y^ul_kLyljQOZAdohIEqGAaY2?39^%IGOK
z{2<%ukt%<d&4h^TN`!%EIz${ZrkW*#%>s7z`Q(Gf&_9r{rk9_o-v?hmd8qgOjN5Hy
z$Ys5jTk)@kbZxT>^lI>dm!?B-w89+4{24e*2$B+jqCk}Gq}Y*@SnY2wfe%)Z{1V?m
zBkLav#jq#Qz_nnfRSSTg6QeWmONa;CxPG1>8u~y%TNzUZ@Q{kcp4(07ruzSqS$KuN
z7V#MJ#0;oDQspWiSx}w9M<xqw_r*hJk8$A%G;Nnu)EL0rYe7WJVaMKoTuMH>#Yp!?
zP~VYr$Pj@%8;afh$Ofjpb1qI;F=f4ZyNjAIXOkw1&5`p4Ob20E`=F=xH_rp0>J#9k
z$i*e_#~_*5-ZmS=?ZSrG7Tc@BiSJ-D?U6r-JATi4RNOXau)TjJua=#X!@aC+$@Y*O
zzvcox=rsTzwBG*aX3orBgF28SS#k=bc|@5I=XKdyDeXPs@}>-N$rqWEPw#P^o5OwX
zWq+$jJ6Y<os@0k15;+b~9k(*Lmy!8Jm`M;}zjOAWE%*;IrhARTAQ0$^I!QL1Fh&kL
z20LfQ#`|83^}R}sk(F^^GpjJV^{CypJ5JNosabT3muan^EJMWX!Y9}k@kmoy@T4Oh
zsb>x-z3(&dzh~a>7A|JL`cIqOZ#drfD?ktN@8d>dg8_gQKx#w&ZEgE)hX67nbu#~-
zi6JDA_`g^cK&7B^XsKm6jh5*WpP~MZZ<jZDX38cib9l!GyJzfx(m=}c-R`)X{*Lv(
z2!G8^ja-=Kos=CxPnQK@bub2A7CY|e7~vS{2z1{>-oKIht@r|8(+_ei1j~v224uZ`
z8Y|u0bzRnCeAPd`fGWA*7d1!cUVXk$bOjASW3Tzm49(kWewIAW1y1@>58S;zT-p)<
zI`t2pfzDqX>@qj(`?y~li68?#06qwaAdtG^N!63_{|V)l|NayiKZ9qI<pXXdrplNj
zLZX%L80Q+<5A@vA$9VQUL;p}a_CXPA&qpf6bj{o+Zp#hq5Uwadxa~a>b>{CPBn0}T
zW;AtIvLAoXm<6gzeN~LX(aJ%|e4R9KoiJ7Inn>WRDRA~lIb6Yb>MJdGc>pfx*f=on
zEBVg2)3G7NSwKTSGc8;v8H7k<iIH;W0{ls!IPmhjhqIEJ_}>D9ZwDmvT|3KUp#lV_
zQ;di-QI0+-fzzJTl7Ndo;sy-<a=2QOJ|6>v|JN=dInXU+r{3M-!Zh*B<%TaabICqS
zZ^PBGgYAxN1mxU`#~a2$ru93M9&C~YKc=?l@Ua6+z3qIRTKhW6;q+XcmW_|(@^%=G
zI#-kNd7D5!JNwnUG4J*>DYQ)(?NlqUCAv6-7*p4SiVvP0TCyhRjg@+-E<d)VOazIy
z3Q^i8_4wJAmZv<e{RAIU^G{m{_-oh9;f-F~r6<Jc;kK%;UPxT`)a^Yv0}<DH;W}RG
zH9++ODt%9<zq5)}Q~I1HtKcNFPa1D}T~?mL(pNRqNm*{;bT3#w<frve(e|e&dl?jM
z4)Nronlp_TP0X~_EvFz;-6|0@#TDs{T<BVSMgkNzOi3srG(HU9#i#gQwCb2iy5W?@
zGt($hv6{@%7ex=r;q2xZ-4yw0K?vo*ozI<)h~hXZ>yRYLn8Kmap(c+k7?IMAF!Agp
z`*{Wp)_W?@M||Jj@%;1+FlWGv$?QJ-4cjLQ!zW%9ZY$I9G9uf;V$&my_@=Sg3vJ(r
zqW6tSKxLalE-_+4W@*BJmY0Hy2jMxf_~Xex6!;3li~JbD#J+j97$|Qz{8F4*sv{TE
z)hrwJ%L(^2i$Mpbb?<Eh`Dz@^_2-2zlcUsle)1|x?A!t^5-D+exQKQe9HPe;a#7Z5
z!%ywKNVv=u;?bw#ObH2b{)FpqfNLHFTAvb3buGrc6ji#YDsB3Ni^!!e`Esuwqd+Oo
zWF`M#Sq06bLGyg=!IJ<*%GQRCD$T;1O_7bFmKIK>mlkDja4eU~a>n`8>vB)MJ0EpF
zXrgO&Gzz4)@thB!b0)LV_5IF|^hV64E}q9>C0h-p+crb*hhOq4ie_|`7P2@4ns?wZ
zV{;0zQ!Iz-<ydw<P~2|2uSU$Uy^1>hxq)wmpQ&e^y3KE;cq7P?($QP~DvD#Cdfv=S
ziyjXgG=oTt36IcSMTW*20S{N>Q-n2^y4R=lzmM@UP#@tj%Ra?qr^|NwtxWlobr9vn
z_ESug<xtd@+f-AHzpOR(2DB~`m5f*CyE#qgAUQquSA&#$HlgR`(1oEzk3wbTikT`7
zZ8j9gW~kF#n+TY?J;P#T;7XzVh$JnHOZC;yQ=D0{^5^Psz6rg%8MOnn)ws-d%2igL
z&$1$8C`X56pg;Ri3r!Mo!($RpI<TTvv%8*3^*%+0g8U0rpokExVu>K$yk@1_?fM2_
z!KgPiQ1pE6Ovum6cXgtUP3}ArbJ`L@b}x(LHUX5!SQ!&Z7-AgiB6#|)|2`;&Ld+DE
zMuub71?f*?qMV*?%#w3WD{g?g<*n=Am!rOp;&Jb7d<@_A(*&Fjn5g>u3`O|0w;Q;Z
zuJjtRkD+r2M`_P<eOKVJzgqv=_ri{owF7vQ(hU+ZBR$Fin*%#|Eo_=h$wkq%6*_eJ
z3@RrRLFmFtze~x*J1wZ9Wz~%i%CA1vvOA)6Rx7)!^8xF^!hF<gMGuC<@xoGkZd-b0
zSVvIv{SvRGVZVWBHpJD=wZ)B4$MDK#N5*okRq2l!y-ZrK+Os%;W>9{otwJQ-f?ns!
za542@$O~*X8U=Xg$;b2xE@WS;2N`K~Sb5rQ&0QYswd#$Tff)kr`<bZcvyiWNAw^I~
z-ffCD7Y7Lej7!CHTKhNB!5?ah%`77V5&h6@7Pv`2fvgZ^Z#L9_y#KWill8c~nZ31d
zh09&@9md|?uP$W6Neo4y%t*!h^bn_PVrvluW4k6T@_RS+MhH`H%up!<F^tWQFFY0v
zdR-%goXB^q9VUa&E@s@L8@hXvyhNgL&#K~Z*-r~cMs;%;4Xb8}&!1nd1Bc@ec|0gF
zXm|8mt)ok)EIO1%s*~9=!IbMEFYV1cY+b8pkOeOEJ(<hN)SL~6ro@e&M}Tt?%{sla
z`I1X?aH>~DMBug^Mn=vNQ!v>~&SHYpP<WWmk8p0Ldq1RO*wC^ny7Aly$LHipJ-M^P
zo7c<zTaa$sDtC@}s4?x#b*l<S)J^kF3ek{qW{u%~IUhr&My$~C_Ku(f@mPPPe{JF9
z%Fm-uSeD(idr-dW^g~sDyUnt<nc$cUiuij)4UoWP8|k~oEn>=>BMldq-lRkMZaMCC
z(nR8Z9lBL7GtV%~N3vQag2i>|jODdI9KMKe>jZZtFeb2>F`6I?g1_;du<jOvI*Hs6
zu2>Hsz!Z1!{+08xkj4k_Z6e+9V7fCgaEk41yoyg>X^>L?2TI_c$o^grTN;KZIA<+S
z;CV*y1p`ewce9EUA}Fu|clqdQ)MrX+r+3}=Y`i#8vYA7A<&n<00_lHQxkB-;$xbLP
zU(vS5_BiST%_a6p_ud{qO64!g8ZUX*hfHOwG>s8mlgd(fC6A{&_$QzC`+?o(p#2@W
z_4>M+{BS?7c4{_Y_r=xgD9Iyy9`JQFIQll-nzyj|PH9hmMuDo^CV^%EwYpS=QfR$`
zC*NB#86tx0%47vaq^fxGr3k)KbFwMo<LfcB_J^9Ps(U+g<-BY$V^2{%f@}Nj6^4tI
zj&7ekxEG0j^#s7Q*3aZ~otr_iHh6zS>bc73Dlc4}t+At?H~rf`DurI7_2H^*(P`y}
z4g|=>{P4-w3P|Q<y_phSv?InLj`|hW<c9lcg!;qNbFlr8C*|IlsIv>~_NE0o*JTUK
z#oh1ueJES@`)06vRd5Wost4~uQU>E%i0ZSvbvWHC`929-%~#gPa{a0wiR5crqLIL?
zbWs;Y#{aggqWe)!H)I=baBsGn^sdS*Z<J)ugj9k*2{vAma_G1-6YinORx|Xx-1~S}
z+TSN#^{L6?hhsDJ?OR*G>J3hrf=}?W_+*v>_iJc>TN99Qgo3whpsO{(UX*6vKKB}(
z=@3Py<jkV+vX)(!5==`8Y#|7o&n>o_BhWFF;mW{U=Gb+VqG^GXMc&b2`D71Ji^{r9
zjS=4;$2+vw$q;peE885|FS(iLAn825O|&D|)TwAwsSas2GTttpCwPVS-bqH;U*+i&
za6eHQSLvNlzd0q!Sho<HphGeA>hH4=3J%vkJCLhC+e%H?Wb}ZcPeaGAnb%LWq3NTU
z3|H0C5-GE<O~Q}<xf#{pyi$?wrOgW@xyFZxGjbh7&-${EU|lGNafjTXw~MiT8l4<s
z)3co5X5h|BKmGmIyvhPKR623&R?I4jl0O6p6U$=~M~IQ`bG<s0HT1#WeJWSII903g
z->KJaew`Lq#;(d2!<g-AN!2PsKFN=5&yq{?FkvDWdtJ^@nRV@q4Usq0b*VaVuhJOC
zIS~A82kvc#A~Or|M`p~fXaIppz8*&-2f>?fu&p0U_)Nd<G6NUDK!H42LSIWr&ok+9
z^c{(ZtS6WK7=P3^7G7e`^10u-ChWY|N%4CV-Ai9~IpzeUTIs+;92asax#pTfxrX1b
zY=q8t-H<bv9L-=54XK}&*?D;#H{P{gC<T=c=I_OTk9WyKHg^~oXzv=o!TApKue_LC
zdSka59kLrV!*iFC*albJ!bp1hB2CS->G9_CmFy|T<xcyjDiXn?83oxe9Ijq}Pg~8)
zKNXe7)<w5^9C2jl6<z^|!K>X?MQTG#oR^nNnYN3G%8$mRqF+e8?_qC{X)K$O&L@V~
zSPfAQRUU3M2}gJ@HNFj+J)SJ>8w&EQ5Iq~g2)AMw*9^Nhw4568H+a|@KUOTaHJ$Bu
z$;R@PTqipvBnJe*m_uM7!VOcezCgf`gQ;}S1g+LWY2A4-TQ}ue%1`}96Yr?vC>f4_
zJXU<H@zDvGu4ldd(8-J+;pCv+U;yd-{{)l#H(^oVTiAR}@14o7GO{ox0LD?8jZ7L`
zeGY6aTt^aofVOH5;UZS&ry&V0(uxgM7~gRE2M}~D2eSgmc)w>b74(1G68PW8tpA~_
z@ZV0M`&Wy5CtChxhjzMfg`#TKtAgJ1qK*#Fm~XbYrHMj3pG^K&*fvE<_0H+2(4Yw^
z2hrsd9d17R*X8l%w7^U~mYw&Pc~V1bdD#k_6eMG%wn;bE|F?v;|7u2tJQp}zHD|s~
z<i_h{X6!?GY~nup{d9nSU)@{$WF^Rj%~IL>@UcYf;8dN*d|CW*ua_y|Em?-S$MTOK
zl-d>z3qs>95sz!Gy*V{Y{^|K)`lUivSO0MQ+HGzJN^BvnO~&O1obp<Ct@fY{6nD>2
zVmJH|c3!5jzSQdD$?G!w`8LhcffM;yDN_*8Yjv1h059`hT)NuzoUHMq$=dRFjbYT?
zo!w6Ws=ob9eDy@w1H}Q#6tQ2ixqFsF43@Vf5__yMJLJ)^xMPtZ*_9-Q@CRu<{=YK$
zA~8vu*=mp|$SiNIy)7l(!eS_Kxp89;E2Pq52nG6$O4Q_c0KbsW3_qFo-fd#6b#Y~x
zDnNEUD?MRcCQm7&O}CumCHH9Up`-kv_EZ@OGs$9L2#UvjX+&lytI2+xraf7B+QcK)
z|B8LmU1>gYAhVy1&>pm@`6g0P-%7Y8*XF!k-rf!Cr~O0Jw_3_I%LW?q)9qxuQgut2
z|LMz8l*7Y-ORsfOg^(^-Mbszs)>);KygrkBA3MNYC1tYkje!tHpiJspZ}Ka*39A0^
z{y3G4(PmWF;u|~H=nxKd+J|?|(|7f(jN@0q8hz)<ot33t_C5#C-^2lTKjuX6XwYn{
zvN_5W>J$5tJ6;J{gK!6${11V>gK~>uN7IS<9@rNzZ3Su3^g!UQay0jkNr?vq0=;fJ
zAer^CM9MDM^7fKx@1h@BgJT0Mfly+Hj*NW84_-^UI3o`IY5e&Vz&@-{v_67|ka=`P
znSIMLm9r8$q%-mqzTV2FFpR-p3H0}>_oQ=?%9byGPm7`4zGTlnxq7?rIrHSsJ@slI
z*1~UnyCAw%US*kO-%7vq-A=z<qTP<I;#Gng)Wh9JRmF7s3I2Xozy!dk?2{qj6x%sI
zXX_q6X1g8tY@1SpxNs-3`LkT!OK;$1az_W#@#ND=6N$a`@{0E69(Gz6!VtqlCD-i7
zD@63bnLZ*nuc|jpUtZo>37c5rW3Ici?L|hEdn2y1TbIA3_WAxdYpX-nU`_e6Ly~hu
zmRFh{EjGW-whFHRe{!W1WC45ufrksiY*57K?ltc9sCs-|xRd>ei`9x@h}`fy9iNn_
zArTb55_e_&#S|T}TL%AHUUF!4)}00DNIArIM!sZ);gKBVGrMF!@$=sK#zkd$$Pwz<
z01k4yZI0`-<F1#wcLV(g+<k4k-Rt~{c4lR^m%S*r#Ih<QIIi`sKJ5NWYxd))iO>P+
zTsf<rCF5nW?MNS;Lz%*e*&XFToZ<A|34MpnhDQ_T36eead97VnP)tO8fRs}~a;UHT
zE5gk+4xY}2sB)_Tl&Fpt`Wx7uV~U2@yu<?{uFm4mW;<DPtVQR7i_`At&ZhvWl;c4v
zW$v=N9Xyk-xaT!06|xb9ztk~UgzfumTo)AFS|bJr1#ZOWWJ&L}(Bebpv$>|S){h#t
z?aHj(2(IaKXHLULQXtEIWsLOa6^@WKP}Ut&Y*O}nxl2~)fPsFKl3~U$qQkzCjcFB*
zSJ8ODE$cB8?Xy}l3mn5^7fojL0yOVXx7|EMynOlH)qZLv;U(n0qN6(8a<ozw5>$Fa
zEpiM+n_f*wKY7W%s`Y*sEHt|X^>&P@y~|p>QBcw|XlwdRPq&NDrS5fib1mILu{xcu
zdgm<RuT^8;1J`v2pdz9mk7hTemF5R{@;@Z!RJb3eO-S%7l{2+EnjGxN27gd+p_!=E
zW@ANc1f=f-vZ?CkGIOALRO{-ewt3?Pi#-_yi)_;={e`RLCx3WQ{b2;UF`J_~0q9KU
z6GxO>bc<X@+-;Jh7=I+V3di8(NUb4EKt?_kZ1sunW81=YpCb1<y=dnVOe!5`2;iYN
zr%JC+uSIYqx$~^T$)aiIHys)w;vS12zQ26$9iaYXNR+7cvR)B|{ilWWZz_6Djtx6%
zn#0{rU+<{4phQ9FNGq|g>w3MDP_cDC(OK?pso5J?$Q<?nzgB&lmRR*;bOmL8JYN3T
zo7%%$m`t!!#e><dOHsIW1dK|~YzqR~G6t522@g`DhD*|zIt^M_JuYMtzUToBmX-kd
z2Bdn1^4DU&XHL*8(YmDT8eRt^{GOEEMGJ+i?1J)$OT)UWCxB#pyq%Kujq?O=TiF(6
zgf_+4gxJ&WUCPy}R}j8c5=z{@Zz8x#Xmt=p@F*p}P!wL6%B$>FevCgiM_KOcSTbqI
zz6o#&Yr>$e%x~<|xu~S>4}$qlf>3~oDwD~ZNRvzcsnIcdqB||h<Lj98kANK>rZ#^`
zJ?JOkf*Q%N_1X+w#y`2JrUB<%7y5x<|9p3?d=>ha&a_+5Mlv8`u+wPb3f*VDuc(T>
z<U8}9`SR0&cDkFtAONf3+5QVPHBMjMGbH4^_|FS=^5K81EzQnq;j1zN6=?Uj@vcr{
zTukJH(g+!UOcQ6^xpsV+XnRB)&FU~9;rJ&v%d_1WM(27OvK}EWgr_XiaQ~vQlHp}G
z0K*d!*rv6#o+Lj6`g5GjiLDPFYOl7CKVhm|Lv0uz)unZC+bqyAv#e5tMNT7+l)CIu
zY7{lr1ODOM&m_EB#Y(|<i3hXi_2Kz5vOm+hCKOHrcdj>)z!xNRYz0nue?`Uon40kn
zmt@!5ndAQ>h@wyDJ?K`vxWyyQ5+9#4ovXo>BHR~<F%1RhYTrQN%4;jWmxxg~^NzIc
z>Z-_IqufsW9^<EdW#F}gdzZ_vfTNb#jqzi4NMNj}P0vkU>A04G<qQVCZK*#BQ*S4~
zz%^f$_8e0w>_k{&J;BN;+gQ0acx@q~dUiAbiLQ;G3Ro|{J6HnT^CXeov9|fNYM)(F
zlnd`WzYvfdAk-}CDZa23RDDF+Dal?*U{QBt-MHV*V<mM4X;2c^WNQQe6PY~7`gJO8
zypo@5CoZ(;T$M3XmMz1hVD`jGcLlQgt}Ftz`zW?@jW_JFtD8xj{vGn2%EJH@cNP)>
zITCZ*frL~yxg69qUnYQvx7N<&yO`=0%^=ZEAsa!N8!@s?Hp7y9yU4sazlQNFto*`r
z#I(fAWufMNUo$@>Nfv2BQegI!aa*%gt8<;Qo1mYW@<LYmMj$tP`{s;U+k$yANe<(+
z`6*K;vN~hA!O#D6p0Nma3u7Bywwv<Otd+%|HYa18L&dkxas;7b2UI99xdGxsYa1ow
zYe9_VQqI=j$?KlA&?&nxK&3DEyp-p#XAfXPwbidZbcYvS_M0gGJHFnT<I~R4qdawB
zgd(D6WV+HzDEq@*2zsqOHw|%_jY^cM!BBp*NPjtw9q7Uv+v?UKj@1Q=&ZbOD=WZB1
z<lfeq8wd=Q*_s)OwBlu~6Dz8fYLWqr@dY<hQ=w<&Id-r}vhpVr)FMbf;P$>n)(3eF
zNZR0fo~*bLubCx<JmQ!7%bvqQlIA~bp4z?cQ{1d{zPfr$6xxq^<-KhzAJZGr+W08+
ztaqnXo<z~hd9qdy$fp<G8)Vl#Lscp=Pg$wQ7DvBcFh3KKm7XKXAx`tlH1#6Jhw=tk
zxR{M7y;51SGg8uN%Dt1OnCcS*2JRs)cy~_AK>+f}MGteadwtCWE?oOpxkz#@a)2<{
zV8uN%AVEIlGy43}H9p0sy~tBq3m5lfx@3rJtbJ`2ib(Vnmy;>!K0Q##d5W>k7jv5u
zI~!HTfu-|IxZqtXGD2G^ywf%U9{TL6+5ccI%{Vr4g~z#pKdQErd%a)^BDYbkvgZ9y
z1)QGkR%?VDabJE(6Meetq-Y7{GiqMTr=cW^w0MJQVn>5g8~u{`?3+Yrv&It@&GXqC
zpl$)~ESLa4i4>|#Ucsjud`fW90qe+ZeraN`sw^hZ1PUn?S#1)@wc8e@MuMV*JGUH|
zfV&8j*-BjCQ(S5X)x($EsrImAfu53IOu>%F{-ttQ*ydB48FkVZB9JZC{?hT=zmx&5
z$Uz+%ImaX^q%L($^<PgtE%*QVI^w^At^Z~zlXPDYn<^tXwZ7%&2?M?%r3-%{edtTy
zJ$-S{=ub3lp8BH8kPgzNxc#c{e}wV>f_b~O4FOu<SkGNo64;gEWm5<{@-vFWe~{q4
z+rPc%_iqvT|M*qG{|8feh-qROAZOofH+)VU0!r(4*u5$Z2k+9>h-_EyQ-LPmFV>?|
zF_=&rh1|V`f;*upQX$L=0|9O}SRK?iPO;eLr6A^;F>Rarf3+A6Ww^ltnMr@q7rb}U
z>NKfdt#2$|&#ZvBc40z8iKGHNeH*)iYSDY(3k;3`e21#V*d}mIlJ)fykz=Idex4ww
z)sa&brxTaXohDtG^az?bdMf++_e=FkR6sF1yO^}7zOWmuiH}J__pwHb1V)?1bZj0!
z1{A}iYyRJR`PtY+rbi~)n_rJDFF8YnEesZ46g`M}Rqt*l@Ht;m75)!3#F-N;^lBol
zJDA)PQNs(C3cZVOjH03ua<T)}F(eRmE-?B~MvjYPjYJJB^No6<)s^;;XU*pglC)Ki
z|2bVuM3;&yJ8DPlw#=z{zdsfB#kY;=q(KepA;X<X6+ffA1)qC+`W8h|8|ht^CTv^$
z;ZpEB`~_Ppk<TnIIqtqOF#F~b;;!u~UF3F#RISpGlr^bqeH813vt?*RTxks~6>6pQ
z`AFBjA+%<p`9w{%O{eeZ@R@gH*g=R`?0zn(bcIZr8v9uc*59>}3S+i$RpgM2J|!$F
zsoEy@SPH$Ma*ByEi!n)6(R^CN&^&v^&(R@*gH+D8=y}~u<g}&y`3P-Gjw0mH@v62l
zaot~~Oo-&*XA<R_pMB`>wOg;|)>u&q80UTgE(X*%8ZaB;_pY63iMF`w<LC}wIcq&n
zq$o3^zuX42{_2Ybqop{TDOzmz{`#)J$zE9=t#XOv1h2Lv%JR;L`!P8t2bUdjfT<?u
zOwhDDLIkc<f7{;!pajqRk{l*hjZy=}C1+yD)nrGuZqpCLOlofZ%=#gyNjsDfuOi7*
z9ZVXIKtIF?_`A`s^c~rn@Q*Y=NeHGw%afUIxHx`QRacWeX7l6wOT`H})rqW*abQA5
z_Y+C6KWwkw@SuPhy$`t9ysK!R6h>a00e3zd1mWj~DY{|dA9a9lMA&<byL2ntWYo-j
zLRnv(5PIY?Um_QTRsjM%0lv|YtuZJm2V4_$5(%~G(Ft%<Dw2u$L|L2#OhYPms!d&6
za1mq*R91z4*qR6{R%+sLfHa<w+2J{|*c9HHS3L?LO$$_+KBnAwV9$BP!F$z$g5msi
zlq$R>o6_ibOSYiqb#4L1>egR|naX~nw|W**d|_5Rh)EZwvrF&(^rXL=f`mm;803}e
z6b*I&(6xS#sZx@}>$xs*=AKP!-EQ)Cgr0V0i~ozd_Y7+?>-tAWnSpVnjKd&=76k#N
z6GS?Uq97390i-t(0wMy^rA9?TrAY~dt^z@%X=qZTMj?eFU1|hE2`vx;Apw&AtuxR2
zKIh9h=l!4ST<1C;_`;n#*?aB1`dYtr@u_@O0+Q4DM^h>E1nu3^Yq#&D{?}boVSOO7
z6dYG!fz?u;E}S_&=2dfaNEv)2`7MzkS0v`@0?_hDvR!ce*SHKZt_;{i)w2|7T{odi
z<C}FZ4Mm%e%*y~`xMiSslkcnJ?_CuYbM^S~lFNE#Uviqz=)33Ci!86BO*NQT=12K#
z-+pLp{P$%#A;`%y?D?G=^1Nj6z$%7iaD)~X<!Y$*(pBr#X?=p~hd(I>s;6EmJx}nt
z*?Dz|>%Z?*Xu`h4*h#O%8OGjpD|O`hqwB60^(8_#btguGO&!YaVOA`ja)n+0e$T(h
zy`(a}feg9jHB^7%`h&aV&KuKhzK#h?WbJc)?h6u%M<%*X<t+%x_$mGe8*095j>{sG
z7@kgte<cPV4gaZ2tfp%IxP4ip;(NzLwccpD32=Gpf1LV{GNJEc_u1}Gf<N=US9D_D
z%CFY5iTzKJ{0x8-n1A!~&qYs!8Mx(9RGIewK_X8iaEhfR>nPINafZhzY<S+K{N(?X
zxn&*i&p)DSuk4#1#FsD1{{K%}|DURg@6ml0CnAHhzv8s`;rC1V!YRkw;%VE90HXu?
z=J#a1E&vw*<<HHR>1Mdaw9$R|`^)R9ZM#n|{>LYT|1-3oI-FSP+h0th+gjzN{^<Ar
zlW$LQujG{~-;l52z7t3zq-^}u52S#BdsJqRuYqmwC9xzw<nDB&N6XrvdGCK=-u4^p
zul@7zJJ|4dT;TtB2+MV||NBj!9^~oi*6=J%clh+}ztMHizNh{hU-$PUaQwT2gC#>v
zG1>3t$+_P)b(16_96ui!KjDW&S(PYL;hcTgE%W4Ytsl$s<uTxi=joJFmf{nfdvIL&
z*>uEY@6?gMpZPY|#a`(%<QzQRev;;heW<mF^dkR26#g6g_&*SZtw&T(U(f&ce(!GD
zcesNhi9L|sMe|!Zzdzv>Jet0=nIfQKyj8dKy~=NJ{2rY78RU!GpJ8-fPM<xvdK>xs
z-P6AXWz1VqJNKulIi2KVq!c{I_|f;TO65K{7W>=5Z=Jr!0Gwph#PJ#XG6+<3ulQ(e
zi%jPUK@aIPPRtcf^YtIN3&uaOe>V6f{ud835JTU5L@MjwpwBl^|6BOa7qOpyz(Xfk
z%iKi%4G*pQ8ZcRYU`q2EBmSR$=?%W2IL5Tx{#t$d?T(#MG+wsg`=%dx6$CV$>kUV4
z(;nxk`x4kZ67tmOv9_MWj4rORm2XD;3V_e?_&l_tB))a+-F#ZeVxBKSEU4Yu6!rtn
z+VQ?Nf2hP_d&%LNFJgG@{u4%AN9U=6*e%F7SwS<i@26TjMt^Rt+jpF0WwDRIZ&=Da
z`hk;ugJ)5%CvM%%PrgQRKr!54N(0-N{M(Q!pSac=auet~aQ02q`>7=DH~QAmJzf^L
zvaR4TW%(ZQ_WDoVcYZu>`up)k?C=v)f8?C}57=#M@=dDvRp4JQ{yq|Er!T4+YX2Z{
zpE)jyl1=;@i#x>|c<~c}#w~i<zUgVZMoe1!!Js+q>sM>q7YR+=JF&Vj*VA4Sf`M(u
zDE=0Q;<y9rsKZ_m1iobxY$qyjUvnaFXSaP&4eAe4E;VtL!zQ_&!-^Yy_yQ*))tU*|
zA!DMsO2?_74h<K3MGu$CX|1>3vr<v>JDTKKIdE=FS6a4MqQR*#fvA54ia@@qT@D1R
znI&4}OIhjdSGO~}#L#y?=sEbiwS@3|MlT#+_$=&^`GL`ZOLa&^h}DWKM9v&M)7xDR
zg+$IMI=uAS?>-CkSA5@TO4_f;BJJZR38nL0INb4gY3cOCTByXkdcBsV2wF7B9Rg2R
zMtf5Q2$@F;@N;pRSy%Q0l<WDIWrr}$je0C>I?kw@DvRycH<q1=D{bGMSRJ%(@|vAn
zppm_7y@xx5ao@!FGp#k(6upAREuA+gr_=c-FQHQRo<l|7K1vBSG~uPnGx#X-nxrA~
z+Sc$G=2&V;rg_j;liHSbUXbS25#Q%ipJq-d-|5wf+A0@bNV8@T^E40L_*>Ynl*l2l
zcRJ_b3X8Y%5-a81KycE0eD|I4xMdMa4Cn6newrHOTUSO1j!&8_tjV9VnwXYVQ4;4q
zTI=fW_ips9adDha>%n~ciIA_wrLb>ulDX{3PMO0PmF)Xzan1Qkr14p8ZSrveymq$$
z!rG{U@YK#+J{**C`=8se0#YbEeRpNH60+8e$vk$5Ia?S1Hc62~6?hsOcRZtW`pWK%
zjrVndj3CCJ;{XKKq`}uMqwW{VoHw;nhQYA(DX24?W{`I{&maz$Mr6MbE<V$*$Gq&8
zGUOb|tJ10JW;`8!KLteR%Zu;MfxyW;-V}1LH-1<d5_C&5s;WGxeFZr)Xj|+@kx6eW
z5wKc6mxbDSwv~SST&W~Q*+@LHg*Ik6<;srm&}3r73>9*xLtg9A9Fwb^?8cq=ytWRm
zgiw(@3)!piRo+2US`AO%Gkw{E5_$p63Z%Lt5BeI?!CM$oFz?RTSMAmUX*FKqHaN)U
zE!=(TmBV}2J7Y~hpoLz9cJ9=;eSa-#-Zo@v1iN5}mBYPcHnKt{?V}FfRxIRp^JjDz
zvXN^8{!iBGYjE+CPsBG*$mOk%;V**y5D5~7cl=14XSj<U4n%<bXXWC2U!{Q6dmG=P
zrv6OGfZRmpi-^h^>mS|bl+JhKzm)Q4__F<7oR}crX!c~@^SYfsm|))&mey_<J=DkV
zxjCt#K;6Qv&@aSi<*#mr8x!2JjxF-k84OOD*J?hr&5a-A@m|=X=5(jc7$>@gq1rY4
zBcdv_c_df#7;9M&dZMlOga#P#+312$UPB9RxE394rWxf+$U$GH<v=D63$+`Q+BXJE
z^4>jyeq0$$a9ilpZAPpHF_EruSh<CeZj87v=^&WdT?EdA^RLUp!Hs#Ys<b%Ts%|vi
zBaoG$$GU88QGBMiMBSq{YBQS2*621X*-v|>57rq0mDLxW*0JKYWM}afhGpzW!SJel
z$mq5;eP-Zz9NlfucUgasQQTHes)@wE7u(2)tA+oI+uab#J*8#In%+gyJE(lj5}uRz
zu?;7AkG8-Z@z?>a7RrZXXzzgsgyKo?_98|Jt{sG2+oAXkHZ#n|yFo95f=KfF+F1>l
z`FrbQgh&aVIPaR+{;jxf0dd2SgRcmE@2YUxbh@4lTtDWGqB8_DD4&p__s_}H%md=)
z*#|iZ>Hb4Z4Q#!l5YKE^H}+UjLKn=hrUmIF&{LJsbF1L&vCeU~;_+4NAgx0M_fDFH
zLzYJ5zBOTFJTZy4lhmfG)20|rIk^HW^2g!t@~U;`;-tFaf04a?T`fZo);iDtaI`nw
z&*NEAcRFFjzn!@V1w{|#(U$FWafA<j?=9{m20mD27)^UmlLOI&x3RY(ou<%_+9A=Q
zAb5;Q%W7*K4j!oEnrLwMNs2^hn;w&VEo!}S%zLqCeD%ZpC%KY*+0w4*bCCiamy|Jw
zm$ey*2DEyy7~<R9olMM3*vp=1U6x@iRM6kPIwQuIq>#j`<I3V?*V6s933)HGinMjG
zv%Ev(P}sL@L*l)hWLWx>TwL?Ue%o5iJc;~uRT+UH9fDf|_EbTP&1-q=xnxLa7jIO|
zV$UBzy?1Sk3)nAH0;4VmN?e!$y;lpFLX>)!BunOS*jR7vy>=e&gKW97aJT(hT()UC
zUVw)`D=6-6C-2u8<}Kgs-C$YDB-lxg6nsq`nxqrIrgeaoBCTq8Sn6&b$=ez&K#YR^
zOO4$UQgC*Gt%WCe57w4s3hCZHjkZ<E=+{nYNajkU5cR!33%BAzJtmEf!L7|H2diN6
zfSNRnRo_ooi#kp&LR7(+6ZTH6w`vsc`sTH3tOi7}^L)wp$}H~#i(KNSVe#2{I7=vM
z<FO=~{x0%KAv-X`qtk)xP=L#l3p`gE)N;*rB;pr$a{SHVBFOHX3L#T>S}i3^)~`V~
z?XCXhu3WyXhA_lPmS(Onl`3ykdx+M%yF^`h((UZceh}#$;GF(N9vw>&&0~Mu4jID{
z7Yd2|8CzK<c98mRK#`s?#BBJY_uXR5Fz0v(21-fV31KPoXFxw;&oA8e&F(`?o3Jfe
zq`PT_!PqMyoOc=JxvjA9#BK+Fdwfr`4-S_kiKtfw%P)Db!A^}KFd22XYM16qwJp)v
zYmKYt$5#bcy}!vh!`_>W;eYbY+zoC3$!2gh368^Ex<{pWT1!aXuQQ3Z*f0^^ta&-`
z38r=3LdeG%BXTe3pyap+J|VI4yU8&uwW#QNN}xC*vwhdo_iRL1*W6Iwsr9vco~*iJ
z^jy80_r#c?vQa-XPyH_?^j#P$H)1oM{y>IE#``Hj4hBhtcY+-A8H1TT6enC%KSnT$
zNfyB2Rym20`RbpiqLRIr8!^v>l}Tpg`x{b5Zlomb`t#@toJb_p(KG+xV$>>I$$R;E
zCnTP%1o7Q?X|`VYvP0#5n-~1f+98^w!R4JE9OwOQ_^{Aa=W8%>%~lP{Rl&Z*O4B#u
zc7?YdP<9m-n8K)B_n-SUoa-B#V?W<u&POFIzzrBr=^T}e`?_@iY(c=dYJi6twD0Wt
zYR|xoOK(OHd?lEPFP<kqq^4wJG1?b9>#;*<IQ26PD*&MOtP&KUY0}~*ab<S=>y4Kz
znY-8X9O@Z0B3J$~$TA|^cNYsF-Xu2KyGp6hY2{o^$+A+-Eo8)sazAJ60)(6^ho@8e
zmsnrd)?1EyuSq0$_xQEpJ`NRH!keG8-<|M*Y_SXEWiq>oDi}p0l?khp?$or|c;C#8
z$T;wv$VNPPK?GA?E%LSnD&A0^0*Y-^bZn@$sSmy}LK_LePjjB>2kvLm{h5ipg@gT|
zGHSIZKpAm`Lsa3}Hd1gyVbuI1$l)Vlc?~h?lPd0~IQz*w3oagoN?d@#Dt9Zip1{9I
z#Dh+?V;h$2kCT%)>SlrOZbAKI0+@u)eZIBXpUoHXSyp-q!bO_UPELy{t)s@C;0eyy
zNu*ruR_i-|_-af_n@Z+^^-kPYxSp~I^l*GPRWp3sX)9Hb_I?hIAlY{_wjsA6z9&z{
z7y0I6!<?6!H{IPMBWKnK7G55!bYpQc6!sOigd3^|Q_4eCV)l^j%RNqmcNZ(WVe}G=
zVpU+s4mEgjS#k_cQcWGV@nKg~@q)r`;vF?^D7v6s``UJHQ$k67kn1bCMHg4*?|I7C
zsm^9hNzM!89@c6*Z-8|jogFks85cY1y<Z&6rSO#ZJ1pVf2Wi#QVZNDq*z;dNC^q;H
z%Kj`RgzaP`Pip*ALZyB0L1FN6LdbQhcF03@C&Nk+RH;Na!krs#vGFeOcKP{)6968Y
z1uvhVblu%hKmm*{xu8)6>gM1Gpm&4c7fBZE@SZVg!+$Fgm#XVWTAy>9d`~xo(f<{-
z>|2s;=Nl*`&50_kHrqr}C~_$L-f=^P)xZ_0kZyaa$QCwfKK`|*D4?Y<ow>Z2`Pe95
zC#3+7UB}kDc?Y|w65@uDfh<?W#vJcp;40scL7(dSP{^A#1KKHV8cB|rtfDdc`i~nU
zuVJ(T5?_u1DPMtWU#Fg)8V>m>k+KcLp5Fu{2X);UR%c~_;7mbgMy~Frji7~&FH8-A
zpK&$ky8!@7d_!t!Ud=+<a~RodcY?3#+=IWx1^L7_$w?|f;H#B=M7WQim~XZL`Rj{Z
zoJkC!dcDX=XO-8fA@)jn*x_6r;4?hIe03@|C{1eLT+2dS({;bgb^^tGP)2WXNAfo7
zWQFPSW;K62A7&m!TUqy>`m!1d|14Mc-s|0VkS;N=eBJx~u$69B!zy>8-*sX<djCz_
zLc?nOxlmutnHb%g8h`Sfd7^oVzAx_m5h*T~P8OZd1op`9n`NKk`WwsL>G1Xx@b=Ym
zdz$0h6$xhVF#()Flk>G7n+pb%lnLEg7OofmxH>fMx+>5l{6iczXuj!#?DcREzZ7Vf
z4orf|Ylw-aO;njsgDE@il2DYQ+hlNhY;hD#4HQ4f51CC2RX8Ws?V|2Wd~CRW&cBu-
z6;iKm+T7}O79<-f?&ul)CP^=f*#rsPt67g1X!Du1+$v`rT#3MNjDS@x^A1`HVd)68
z=Pim#Tq0Gbf{)dr==|o5P37A;HjgZRV_#3<5HsI1crRQ`1a-d+xVCuvaR#QFn2oo(
zcB?F%dEA${6(fMaHYGYaI{?1j!b5>%an^v`!j?&KZML`S1^&O;a$p7-OX{HAU=&NG
zjn)tnkEN4yZVz4IE(xnvbh1mTtEv4XD{1ERn8zJwm_PQeFH;^AjkiYAA-;Qo%Q7{Z
z+J)a)tEGgolbKt)p;Su6IFD*nyZixy%o>cZ?n;!wi4l<Nv~F=#=-o3*ytd|GiY_W$
zTyyzuo+b~6;WGc3F6^F36{kI;>wd{AKODusX9pzueajs)<@7&Y+hJBh;@3(MqT}<k
zIdbG@>4wTCx>;q3l)JvgX={x(eY&1!9r8pn{&9LH1nU?3GSu4QO}&RU1}CuJ<;&*P
zz<|0y+P9fD;@U6FIAUnbQ3Kzt%t(Tm`>D9KX$L|ICQPWMHmSkg;{tRQjGvx1UvwX{
zNF;@&RXY#83#2SKxaV-y?Y~^pFK<>io8(FK(;Q!YQAa#ICaxINz7<_=?Dd%-WWF%0
zo^+LzU{82YZOq}z!hEEYq}t#cQ7x-!#}K0Hso^w6Xd*!%MkU9q$XZ?u)kqr>cp|=4
zX+mDe^{C1AQoX?YkEGPN4o>83M54*lE|0?iZX!PQT$wgP#oaQAV!hq1`*~OgWyomI
zi`|gB%pHEBrKv(4x*-zwh>}+0JyPQa2u@Pg6?0LlKi7HtI!7hnnb;J%#E=^|l=;RN
z$@Tzj_M1#J9YcY8(@F*GZk>0Zidg!YvVAxPd;da;t8*zOQyt31tyMZa2C4{ZAT%3j
zI<UiL{^bC{AWA^$GtZ<W;;yDe$oh47PSo1*Q#DKi+kR8YQWp{9;j<Mq!hXVu*0s~n
zdhH$RVpQtsnrT9!X`yF+Gt2?ROvl)nX#`d~wiqXR90)ow>nw^qCqlTe4<nJ}G)c%p
zNlAvV$5l4sXNeO=ax*G1Q!NN}<1~CgVkMp5mZ*&0=W4?t7#_~&rthkSF1mz#37&^h
zrHh<c8R}-YkgPhyeL(6l7c^*DMv3^BpQAXWpqda0Q=0r`CfQsiE|+h}S-X~@qrIOb
z?Y-Wb2G8!f*?#tcCku<wJm5lR*!t!zy!`5kX<K-tnRD${a*s>Hpkh))>Cqjk1jnaR
zscM4(pKi)eA=N;_lK_<_^QQIuTr&nyUH}-WE$SdusD2^-E4nR(J*o+t$!sd9w4NV$
zB2E6h9sSgG#KM^)A%dEWCZ-T$a1VKS?94QM6y4ulPV4L}S<|SjQa5!8@<#3!zN;69
zAI_q~wg$~ojg#$L_ExM1mmV`A-<C&Hd57)+j{5=23lhF!i=Z*NJ55%*buF^y4HN;H
z|77y1_+efpZ4}Bj3{6Qb{WbBJu}q3BoAt(NbT099-L+edk6^cAZH0u0_SI^uFAYWg
zSoXF+pw4M-ow-eZ!C^#YL<XI!QH8n{IjzjN_-JF#&!WWS%SAPFYFw2{*{nFHm@OYE
zhlt3;N|Vp3+Vx?fDo&wZnxH8BeYCfq7kqzP-zyN&L7lYWZlJ#wk!{~}i#jt#^P@zr
z3VRd0vJ}e2RD62w+U^>9X3}*p7J7!r;(^=7(_mIor<{G9qcw%79${!zXOJY+q4DUE
z*MsIHRj8cycG#y2LAX~uliattjY@9<0If!k@m9SWMCY3lq~D!T#@ky%R-DCPtmZNo
zRB-SB0)&q~ej@7z<~mU~WNB<bvQ-wSL){9UCe{@oky~?Sm0hVOT>^;Q{_w@+r$EcU
z_Rx|T$|V|C&-rRujJ)vHrd7>r_Q92qHNF_aJyW;Bi#AyL3$$lGO^u?v;uW&7?rZ9g
z?W~@W7v{t?ti`Xzf;NZj`&Da{K&beb<TcLvG01#Pv;2a;U@5m10vxw&ryZ}GAqZGI
z(>mI(mvlpzQKfEy^PS86%gD3&o-~K#my+qsJ>xETLvEMqId&(u`vI_tOP}&b?Lonx
ztkHp4wS%pd>d*r1`r*cA#mV;M6di^D!X{M=ILu6Ixc1f2!%*7{Vi9GVSFqFm`htd=
zH`%V6KrlXxAec?*E}k4;Vk<37D(QQFi^g7yKQo3|D5b3za9oPvZ-Yq~AY5>y0-70L
z@l+k!cP8M?$6@)nP7RaV)?3i>u=N|Wb1fkUW8K8Jldn9T736M@$YT!rt~yORb5vYl
z=Og~?RsC?r@BMqng}66=JbK@M3x&$}Jlg8Kv$zHmD;rW(`|U)%uu`^Z$$?5n<j%8E
zfoun4nYl{T3&tpoj#c`;4J(U+GyR-Eg$(mH9qe!i?P~1-f7aAjqhjXdE{#5K3)~&`
z9AHj|Wk;~#c#w@SGfmyR(^FIn7Be0<oV4&j7w*C~ydiqNwk5oft{t4ltJM)MjU5j1
z$VlK9RS8|{tIe*3H4qH@L8ndV?(&Ayfo^!!qJ|pWR$CKpGN`{g+f$WbeIEO8UCvz*
zo)Iuiocv9)!0-JhvL~{E_oG#X=Z!z5iVL-@-I?D;rKpr?ALIp+jqF-efuiH+D;42f
zu4Lm<WnDY_L&~94u2%guXU|*eP=D@lgmo_Xw1U0n?#f`9Nf-VIXWX9uHsv%)IZq~*
zV>7Ngo2<ZSSqP=|&?nl@7MXlA9MfOxQA3Y``V@;6p@ZCsm3FK(4G-nE>fI`HQCnGq
z(@BAh2kKB82vn}E#N_t=OFcbpHfG54yvLsds&+9AsGIs4_EDm!FVzFz2`^aJX1N$J
zc!eC-r>Kp6h{<$m^V0+!&VzH=1=+@FV;(VXsG@DZjN}WXuc~e&Blp97dm7FND-RHS
zwC031CvAv)?=3hx0Ywi<tgKKs3s8W@q)Zz3weDPYYHJP7%{5Moxy&l2p?UWZT3B*V
zrZ18OK1*BYawd;BtTMo}%2B?Zg`2&>AqN+3A)Oa+GofU|vGW^|4r)5GPj4dc8Hgsw
zFvV1SrRf81F??AZt>3F$<0bh7ONV&e?@Me~dElSESMAuouMF7;O65$O3$jBidpKB>
zlyKzqdsX~%$+TJs8ytiG$j#0)`&3oeA#d2pRHr56Y=k^lHxtv#^k+U>3uo{{d{%H~
zl&<7(e+N8=9bQI19FYfYnpdj^+(brjc7J0|gTHX1MLX@8lPV@USq<&<L56AHd5N*m
z#)Adlfb>%$yqB%{vPyh94&t`f1E@x~;(_I2tQx3G0Ht-}zHNVg%+*PK_{G9`L^WX4
z<s=|L*(+7VN22FXV(MMHikovA*b9eJ!U4CDC)q?0igVCm(;7U#=;?rCB!*=fq3DW0
zk^LjTI=l;hR&zrs`Q-Tx5BW~|paPl-O25hp{REQr&Agxrm1?8zzf2M1+y!jv8+&m|
zAzv>95uSi^GCf%o`&IH7Pg=%^e~yEhs>pf?m)M9ZlPc$=0<U6jIXp)L(<Z5zobsI6
z<D`3uPX;JliGJjJhef~3C;Hj?A}_2f9lx+DD=_`#D+xotOw@|n8tW53^a}Mw0?y%k
z)$lAi_Cat|Fe?->m<lq_Rm>`x*7{exR%P<Rdr&za?@)FPjB!GL0nNj<IX8_A9#NVv
z>#tkH|AbXt-#N6`rZ20}`gD`!%U6Yidh<`4#MhS7KO<D372iHn-DHVV%%J~X_5D}5
zn9|O0-Ad1=$FtwsHvp=t$Y8VNO3F-(hup)Bncr+x@-R`Fd0|5q3zzsr{9U%t3<`Rs
zo$>e}F8TT+KXa$h&x?@7K``;DUFsp4k=-qeOJ{GN)~+s35}$(tyoUhecY-KNTtSD~
zdONZ6ov+%5K`*4^4J~(BMe1ftTJe;WK~`?3OL>2!Fb?K02cxp!oPqpC1#&ewC3C(>
z-Y`>l!AQykJ{KHCd6bG7K@PNAdt=Fl5!)6|;}L&N7#^lWpm&9TTU3nYU9zoN=LdR`
zGGsTMzb$v{24DKRnKAXTx0UeFUzCA4k*7AVl*d{vy1c2i5#*+aCBJ*!)^p$z=?983
zDG&29UUeSZ(S@H&N*lk#UodDM{|zL2fS4*4Y$N(<a{gssb92gpDhG{n-8~=)!``=`
zGdx$|*;hFU;9TF57(k=3gBN}&UBJ0harnuaEL#J`kl_pJ-sHH{Lm|NwmT?9j0F2wV
zGazOUKd-bNV{DyH&io7eX_;;veP)ala|ZAulDGJWgdDPuYg~u6a&+?In$*qV(^D;J
z*MVt;%8>Ps{rG~uYZ@UvvHm&Y@FC#^+_~umRo~1BNxU|8c+i~1PdVM3a{C)+zcxJe
zfQ`-=WH5-Z_h2AgHGT^z4j6Aydpb-?zX@?=M1s-x#mEbsg>8x}r#Bw02pp3)VHB!F
z9>m1T<bJrx5&ycJh=(`->SKyY6+@ki4Kv0Lp9C~!>PvuWJ9)HEFY7}0_)#<H*dah{
zFfai)&ea>^1Pur)eV^F6O_HJI3UDhxr@hx@3h>LswCs^p(S6K^cR_1PA$wf<<mH^=
zKGhvp&Q*T6%ov!|9X443V<o5V^V?GjI^gP8SZ=2PtD*iH1e%uVc?)5X)RT2lv!O#_
z;KWu}K@d)@q`;mvN3d;nk?Y=Zmd`yvc}bpth^jvgM1IW0%y$L?Fsn+yeHQx#l{z3-
zc&17-II$G{@u5oX?dgVs6rFGP2Bq7Klb4EPp~Z&?^$2)$i8@rQPPLa6>W~LH7zM-_
zub$6-Z*LNAK(GxrU>l9FNAeE{q0Z^Uta9TCy*$L%3#t2B8%;ctnbUAXTW;5{w?t$!
z;*$(c_v;M>r_cZH@0?5Q=T6BMY>G&rNBA$0Yew<oB=BsA=%HAb$j;N3VzT!l5g&i$
zgu4FqgSNctvaazhikqEDrWi^LO&D`FR2ab_ytF>b9n|xjgdZfe12RqEZlWp2Y$JiT
z2{x1t>eMcEeQ|NV(zoPUY0&<w!Y_kHX(7gb2__**`Y^_aX)@|W>V6}CmA!IS2^*HZ
z#$3{Xd^_r!c^zp+RjtXoUs02#pYEg}cW^7L(zUxwhuNK1x`1>i7I`|$Ros>dG3Hpy
zNPoV5D*uBjQw1*ll2?Gw)&<Ch_8>2Wy=rH&i>0Wa2{xZq!znT<jAbP&y<taEir(t_
zID}H&pZqPjP!zQT9I%BwU_6217M`F*;l^Xn2U;!3=(c$2?z9ytIVhzO7@#5D66f4X
zWNT6t>)Kf%<;L@$v;C|H?(n;oERIp3r}Ju!YEAfeRV~hLnhM+hWJXC#o#aW)Dxv37
zd2Oo-TT^j(k37Zglf(0z2CuNho7@1%<8ta_mj4A&R1=UGN2+;^LZj}sSv_g5@|~1!
z3G9F|7XcfN8t-y$L_}?N<053=fhyGUeyhSlK>>=0L@KY#*A<hDB#XX&B`qGXR_lm(
z%{X_<yg>`gJKU7BCQ*+nZzEZ$lb8Bh4u5uPMyXkVJID9eh>7MTj8g6qc%&>a!DgY3
z%qo(n6?r+6Q#EH^4KII1>QK%x@w+O7O7KbT-r%0@l1$-Z>Q|FTDP2#bL}WQv0u{i^
zL{cz}zMuE6u4wNH((0e=W;xTxtc?o(M}T;{U|;Q`P};fe=Jz?7RP33lJPrF8=#kNq
zyviw0u<vnu<JeWSqmf~#)yp2xgd&7c@Brm`l{JOqCp9<WB5QhIMi_;1h7vQML8#)8
z1Hk_Ar5KVV?f~J_ru!U)Y!gYiGpmSGMsK?YI1oO52|}oF^!^rcK>+?a=2GewoYaeC
zqQo1Au;o-wooNtC+K}F^x3yf4a5b(sIl(K4{}l+)nWwaoZJhRMm8HOUe1O1KP6y0B
zT?sa)r0HN5emkXY-YhS!E9P=vs1GHbS|x%ac;MZpt}MEJ;~jDfJQ?)uF8k3EIlCYt
zQymqSYg=e~%f)lW$=P!PuuQetiT7!Jb6p1<>ZavjKS#rL_gz>y9aCn<;wYVmO2tSX
zra+5!!^(}jyE9?i3k{tDcvBZ$Y=cB+GQ}=WtCh{MJ6@{xPbl8|iGxSn70K~kFDdG}
z(*>xTM4u`?XBtM|;7D(iqAG0AUUNHRq-UmvBvW#vW!;i?{m<AiPuL?0_3oMGmWIOS
z<et0!KHqQbYaxHKl_prhP^|5dI#l0W0m_zcF>;P3bTWd@MW%JWwJF`2n>it1O2TwV
z9deZV01jl&{5JHy-UXIEWKZ<NfKOLBXbhT<_%AR_(w+%M15xx!yEKZv$wp_OjZFH$
z+CPyP^v~`*;t|7Y06QsFBvvM=LtS=qqFf>nC*U0aq`1&tf<_Hwt<Deu!-P2>9s+dc
z;~P4O-b*haEQ|H$8ne^?B`Fe@To@4<qzxpK3uk$4CV#VK#vnU2Tav}&up1Eg#sjm|
zuBT%ja0~teB9KsTwWq*GbU=5WsG9-13Iz<(%?BwU&P`zBpGa_xOKGc(HkOltMTK8N
zD_wWsr3%_9J_7#^xfB;doQhYQQgJ-dwVKeHK2=kyB<*W=<g;$Kc#}_`_5*VqEN80Q
zTYm-+Kp&Z6?62GA(c1cPt25?ZRra;nkCF-Sfq`GQdP1%6pfR{{U*4cwUzjX~*Y_D<
ztO1gav)`P}Nono8*4fx-cT(|ASLH?VZ!{Pc2g()5=p0i<9MJ?OYB?aC%c*tt#B*>F
zWq{5z^Pf1b>ZWOH`ppd==|jZ|3oq+M_XW5_cK)4i(6Fy@kuyr$dr}=gcCtJY2zK!&
zR)+hVPy->a`^PS*%IrL@kc@If74MzM@X#${@4AVJ%C=+aW4$nT3;3B@^X+#>cR#A9
z;~RZ<Gkn3Tdv#Zkhr8m;!$z)!NWoS@PzAkK1mzA`(EGDx${{p9F7JVlXuL~fNMm8_
zfM*D$1D+FJrMp|V;5=5FksK(d99LU>jw?_Qbf-hzOz3;v<YB+i_-b|qEQBbCa84DP
zH!+YmxqawNkw6K?)MjFe(sBoNciSPN(*LAvV{QPsJY0jo`YGB6Nih*w=A9s?77Jle
z$hYQ&n(-Ybe}S<3-HTm?tyK-|8-jk1JU`~z0Y`?wWbO~`dmm6SvO)cM4#1@pQm`jH
zh95c_b2hwad&9wqVOICcM^s!%ogp;d@O9NeiMlzaXbmK5La8I8cAq71>T6Pp&o|f*
z9~BU(d3UN}4>JSE9t`)h{tgktHbXis%nxdVjMtPwPM~j2U@&*(4J2z-y5+bFj8UIM
zpt7~0qcs6NL0{tU5^O`r5h^)svyC!$|AqFFl!%JEb~3pGJ^vM_-0RVwtAl4~7epu>
zB3%8#{lowkK^v(!v0o@dZgbeQL1N>f&1Xu#fkLSj6J2f4OWE#;G!a2Dhzh=uS$Jt`
z&G3Y(d)xvnFYH4f4ObHJ%(RqRvkz`jaeZ@{9eijVvhhZlwv;myGDEH*YxbRDCLZ%}
zWtOVBkq=xfGe#6N*Jf7$aWK|gbjqWxv7!*Z??mf+sRMVrwCdavx9m9X^~H9OdeJv$
zBTN<#WJ5YVS(DD=`{`0V2O$rDVj6$M(^TQQj@?%A{k(2+gTRwQv|UwU3rWkV1$7@2
z|G>KWx#z7LfQ7T3`k9!E|8^+wY@{IKTnF6^VgU}hO@$71N8?x4+X(OKs8Y6xd`caP
z#lG*~Scqxbbb*`etM$kIwsWajqv|AE2($(Wb2IR0!*-WMH@dCUTb+sDEI{GS?eyji
z$l=Qx*rxz^#%J@@`q!JIDNZzoKU`6mpLB^$=@~33bkYfe*dwnaqjN}}`GdA;MN@WZ
z9UDa2#A?&{d-xVDVHG3e0iNR>4L89NH7>js^>?Uw1}ACSux9pB*7OJS%J;{#v1DGr
z5yx){WK44yu$Ju)U3>P_SB~hg`-SwzW930CgWh0Pvn)=RD7uO{^bCXkQ^TY;hj8!!
zBGMaDXjn7all8_c3GJC_^{xa=Rqo$tNabbjfM_fJb@sO!RTh;U&tU8JO7((~lihf#
z5ObX8ay*cX%aiF>ZDf;VA-ijw98I5gyUGBb_Tk3`{khr>VF+!9NW_cWSgYcgrY6z}
z`lhr1{+EPK8R1llY;F@h6R!1L8wBE2qteN!z;h;3Tb>}btq*1vx+pyz)7_3JVT!tE
z%s6MlAfz9u3k!H7itugMUscjxLIFuJ{W??3u|7q<NFN|fw@UJCy;j!GQOYE-{U28c
zb5~_}E#%9DtZaCZ|BMyRQisYNGK|o7wx-OKOY{EGUSy$ki^5Y%;Qd!u`!|HJ!{*MB
zVX=cCl$CD8$~kkx@hh@$FJI7ZH;F)BDn<x_W=e}NdlGqJoOKATg7i!~NjocS5G7R+
zFzSP3JAm=pW{DIx+MY^NhKa09jHVw2jeyWe6~qS@-C)xaE?0?HO0rKx9$c9zE};H~
zJz$!Xo7y;M;GKRvEzvfbEw{{qn7Qh}<iIx{>0%myWG3TyS_z0qj`gUFBP!<l+BAAh
zsU%AuJmVCB@poV{cPenU{d#uzUO8uny37m#iyR|v{j+W9#R`WNkLlnxdsdw$lwSkt
z&}>G6SLa~+C1gW=Fm8S?g1vfM7eW>zyK$y2HCaPI(roK^miGrKQCV(2Tifq(a&_lK
z{)|fSw7YlJhLo<>5M9j|X)qfP1ixCd)GbQ5^t~y*tvJE902M4e<X%b#>yM}#gGLbl
z4LzqfVR_YIvSx9Aj$WB@YBcug_c&!`QKSfjpFFz6x9y^~zx7^8i{PeIxLjQ|EihlL
z6ca@adf+`DQ3MKb=7uEKu9>)X)`hH3A3vMJa3d9jwTl~Cl55J_K`ZiJ%FNd(60xa0
z#X|=34Yw(xAl(gvmc9H=$jz&oEuZsV_@i#iIOv3inc`KM>Eum)sQy9)wvF&evq+Jc
zTU*^=BQy13Mq?!cD1pl@BVj$V^<~#&4}+%6*?0HBFxK&*NyZM>>L|V*fMoNXzFV;t
zynkjqh9Cv{g0|s?hWJ!^BMVJ`vq$c>-zbV@FuceuOVgdYSE>=1!VAWh%Z+GnWPB^p
z2=Mi6{^7EAua(EMv@d-n+2NBBO8S|$*fT)NJvbJ>g(#NdOxvA|qa@I1noT)5M@n*@
zB*hwa6-PWy(Odc%w6&iF8y@e*y%Xdikilz;LzqdI9C0#qV75<DV(1E43kY-LD+{>O
z$v}d>?MZtyi^uy@Dm-@t#QQO`T)}%e6@$oCjyx1^FUNh^fUJ%NLUsw$-ow*ygmBT8
zWAU6&#d!S^aak=AH!`*%H#R+DzELpcwsDK1q-n7sGp6MgR<3Re!!{sX9o3+if(dg=
zi2=L5HuhB~)u7<=a<bZuIaOv~V;rpii3j5T&~8p=DI@2IW}HJ`3sV5%xpFzGg4X!6
z=?#Q>y)8Hf8XvPAP@B5%o&Ht@_dZ7(a))*aq#D%%FmD-0tNT~qd>~I`@0(NWCbwUt
z>5kH$Kzf}1UC7}uxIuhuyC~CBXLdWwoBdH6A#SYxPd?|>6}_FWrI6V4Ai`jP!CBf9
zcc~J<*uCWzgcSD(JxEtS6BqrMzOIa}P<M(6JiWR*<2t%aa&^XZ<l!RS#0>8&E`JFG
zB9Ccd;_HNf>&OXv<fcOikOQ41_rmu0IM3{a>L02^Lb+~?h-PdV1!kHGQ_oaUr!RRo
ztS;U3rEPGY>qQ)zcz@C$<qR;`PAE%JR~GR#D5y8bDX2dyuYR$WM9Ntzfjb+gx0(jF
z?ObcwolqeZF~V~*vRVn7Cq+;*Ar(N!wubF>Bv8Nsj|8F>9Mo=hi2y}j*cD#g!;s{0
zj?<{DuGBwVvkmEaE?w6DEQoL~2_wyginrPofxn@5Z1d0({H?M>gE&c3+AHx<i%ehG
zs7nxj%_)dj*i6c4Cgq>2>twy=jk($G8pe<a`PAq@lx!x+#&uf+2ehBvhsIlQTjno-
zWUE!RzuQHTE#}eKTg;7Lyb7J<IUcut4E?Xh%(cQ(a=c=c&xxTp`HG>78tl@`#_Rux
z(mhWLos+AhHOarIVJ%t(3hx%Sw<oBBz@a`jm(ZT_HL7s~ZwtISJVVwf%S$55dre;O
z?)xrJ#n-ex1;3hC_05TYnTQX*w2)EQiKP=6Iu|nh<N_4DHd<>N(Ai}e>*vaIOh5*o
zc3@w1ZyVV80Fnh!;|<91mQ&D^=?C}PV}k%>7SuYdhp*(>tY*A%;?$cgy*u;Wu+0E)
zPH;jSfgJ4SeT$d_832`A_wn+Dqn6{li#8aUo*~s9p805m>oTD@xMH$%LD32Q>cHZw
zJty;?<9T?7YwXgMK~7>Eke#vG&5M4=TNqsp)U8<q)jXywUIV+`)`c+8U2Gj$SCehO
z+qR`QzLa>pjAQ{vOjGxp*9=xH0r4T?Q1Ka5)-B#oG{|xfeVABDyBheWt!zvsQ<{C@
za&goK(*|7K$*gv3j^L_>m^qGi%a@$|RO67E<rS^+tmK1gT<w*a__hNMKZ|jHC1F7C
zZ(B_xvKqWMX7}p0OV+RNA2#@`=U=`q?=;$K*+mL2NMViUdVffB(HbJBbOFJn4nVPC
zi?-xJvOM-k`9xDotaJ+l_fe1}gYDl-gci{LMbVI;NABd0j01u5FvZ6*M0?`iY%}2%
z5hEW;Wj<Fo-?%FqnKgQD+S|h1!HIdt!3mfu*I3XK($2mU8MQU5v;gwWlq{MuM`@{_
zpHU+m!45Zld}tElf=Be7b<=6AaMfz6I-oX|Aodfd6{{Rtwm14t10XHPxa!2JqziX0
zNygCBwAUxp%cv40M2GC*C(!dOpeE2mt1st>ZGY&26U(aOYcOJ-yE!h2_jXJFf<Sl|
z4uryInI-yxMNs%ePf@D!{H8r62FO_KUlhO<HWT3p#}&*-bM-gZy`Pz{KPXnaxv-Nz
zW(ephngnh8d{(u{$gB6s9t;NWF?zWt`HcZE?C(_-w`w<eB1pd;f&QuE`7rC(U*%-7
zMyg;VD|V|F^`WPO`ihmXY??{l=1xEEz$#J*DBLvg2==LTQ-Cg*goFT<vP_^-wz4}=
zB|&9B0VsSA#4GxN?Q5O$Kpe+Au{R^Ts<k0=8*toSRb)m>LXlGeI<LBD!Y;>>qP#5^
zbS}th<8+Ya#yJ%$%DMiw5-V*Z`eQG&x(N(>&E3-m8@8Q26Ecm}U&uV>wh-AIcNH+I
zkI@dcho&_$2MrDl907`YfXCt%00ZRq$6vK0zEFU&z9gcgK+#Idz<A%EMN2*XCE~)`
zTw#y)ql)J3yY<sfE7$a@0&%)gG1Qe>_N{UMkScl$U~8?RJt1LhM(F4Tbf#y36{&PH
zy;4+cS^dZt#knn$k__o)xP5$+#kgvR#;Qfd0xb?*{8&hdV_+-t`PUyFY-P(Oo-kgz
z&az3Klt0bM(62_Z{>JC0JBl7}+k0pbVvdRnXWwi%JgdvRu2UYX!Y)YpN_C-q6mH5C
z@(4pM`S%^rgMDuHOAaJbl7z4ig;3IN%+exj?G-KvW3$(rW?hoK&xvi-+xjGk4qM2#
z0qi4IAGe}gQ<y$6M=7~$NBimE+jmV4AO39B-a$B!vQIGF&q#A0rt9sONJ4_ve9*68
z$MXyu3))8pjobq$=Z1>>9(WE4p^hnaz#6<9fmGn;tR$@jKfUn!gxf;8=K5JUX7SfN
zRlk(>dhbWN%~jr%!E$`shh=|p1^e1(;r?Uz%wLjWxUVll2#pv-yk^mvxWq%V-F$lY
z;BQ0sa;ggNx6T9?l-@>i&t*(y<I}ofB{{<0C6($xNs8;Z*MT@CW=uWgD<+AMDv<JQ
zT%DAyQRR@sa$0HD<*%nMr;ou4Jv^+eU+rFauWtL~&LLNoy|$af=5{c=eV0WHS@U%v
z`?@?4=Zq(LFRp>mP2uxG<{b91)NITx^Z9Eje>*?<3d4%bgjlU3j*GtvBJxi|19kFL
z+|iCFmE6x9KdGD@wY`L&Y6*qy7HZRW*Ty0Kql$Ujp*~NVXQGK}Mn1E!Ip#KuRaJ+7
zwNK5`?Ps>|8oaB6j>nlq&BLSdAA`a(7GlCO8V`%kO^m(eoa{RvaC()0;!;i=H39u)
zYfvB>|H;VkTx3ZnW-9!%2jGKaX)Ko@qD!D5P`P6KM3WS23v@`$4$jE$%m*}+#%rxY
z7yHib2=;iWl6GS5AETMQ>r7O1{6^3MR;-vza#F6tXuHU2%?C61Fwge6<IIhx-zDhL
z-sjLhJys&)x7^5;rfx-^EAoPJ5r|o@R(uDx4bR8e&4jz}n=)60xX3H3aFzM)^juB%
z0t!oea%aEShBF3E(2dkes-G#gGvJPlU+S$APZ$Hs)KXW2na6>&aXD2H@hDUn*I<Ie
z%WztZ)4*Z1YccEN=p{FJ_H@#rIV-KxemDA8stRAQp$T4czRQf1A?Tg{A*&ECDo?_!
zC~I%8xGZdZz>>=;{rI(J;@HFroFe+Jv%g5Qc{s9g=~41Y1f5GlDa!Xr0Ix!#nT6fe
zvL{b3_U1NO4>nAe<PSy81S@)+@&!kih}Sdand37?u#JySV)S3S1iwjsIDE-S4=Sc}
znc?*YRE2<^ZJ;jzwfGz9?0v^la-IzQ(g$+l6aFk^-alk7L7&!P*wGZs+<W+g{i)D?
zyseeKOu8ZaogQnsl(vZ#AP!gB!U);tWH$HD=Njz11!^8YnTAjg8AQI^16%H*m?l>S
zv4mjJ9d8w=-;SAz{#qPDv|`~?<>IYswO22<(AqtSI}Wb$ktD|R6j9DIXg<VFGXK1(
ztR?arFaB~KIo@RdD|mOGoMlB&@7I$as<6t<b&sy3J9wyGL*mS3%Wj~4;>B0z)thxc
zh|R{T&2sdNIvm`i%EMxB^0e>GCCfb9eX4#>rQ;w`kM>m^5>3C^a+pIT1GzDGZnxkR
z*I>Ls2nEB9li}$6OSy&a-9kr`a2jLoii{R75zRofu5h}UIdRyA^nr}envNhEcW)kS
z_tzP6(~l=!?tPrT3uc~8q6HtSG%}z14@qE;_v^`J$r+q!o;d_FCp8=h;|}hM-EJ%g
z?KKZLqLjU3rG1!bP8P^@I}LAJA<;^7so84MiNuB+Jfzpnite%><i<q@?{_mVlfF2J
z{3UW$;|^@T;jY&_86@;gO!nFE`AY4hUwNSUD<B_1<3CZNhxtn^4bR)jg-s--5R!x!
zjQx?ub5x}7(FWJil@E?D;5VBa1S0$Q7}xMxQR(Y9kvj0dosnK4Q^^mHfRWuG4Jgw4
zP3r@pqfphL{`j`@%n#SE+a!8M1A~;!1!ZKRNNMOx)${XRsaa}XUAR})rQV+&pMc!{
z`sNR8l0fdPv<d!=bt!D~aYhK~%WO#hw-+IdgzJx|R^u`7WmS;g?RUJAkZarml5--i
zH<rJ=C}-UiT=z0WzNxS%u#bjTB;re|J>{?G#A1Jr#onL47$q*4^2@}%6PIID6K?Lm
z?Uq!Kklrmv-B}q<Dw@rc&8mQAZOvEUUJVGsKH;_?gKD>i?}F4t4ZGjXgPjaEev>vy
zQ!EnG=SIpl93NuRhpd9GBQFhAJUnzL{c{Aa(IB=9f33$`_#`1JU(2UDC3MWEaFwK0
z=rz{;*2|c9wmH@Rap%16S9k~a1n^ha7W#S#)8K6H6!6r|?=@2Vx$LoJvy7mpqw{`_
z19mba^`fZE95=%+C7H#(i2nJ>zRUfEhl%Fc7A1e!hIsX}Mvxv|XdS_t-4(9A05F}x
zznPAA0!UmAs|H-pfAFNM<hH8y^^cAZt(G<fH4UP2CGjm*OUOGu?IRcbyk34Plkn9F
zLnm7QguG+)%sv0hG?|T)PS0NPx?~*F`t%4k@$gaC(JVK=E7%=tFur%p#Hqad&O)~n
zlK1V(3kAMcNq7mtSG5zbG9KLjz>ZZv8v0ZHtHht<hUYJr2;KJQSWE6)KOfAi<#20r
z6cBM03tq^f2;SN{F}<|V$*U?mBn!4Zhjh22_bMgabb|AH73b#hfR}Nk=P$v_S+!>`
zs(-P}Ha+^{;!!b?Gb5ErbzUWWgkKgb4=z4mKcU|9`}u*ck2cN)l?sf;VT=%Qj5?-i
z_E+j3e73=Vm#yBa{c>Lnh4%;;yx*B0Op_BWx*ZmREwe4Yn^VmpyXVJFdE4fPO+8G*
zi&xnoH^;1&Hw5)%hw{pWG70i{X!~}<>8mQUDMo%*KF6_4+#2pTExx;8Zv3&)<ATvb
zWcC|)oCB(7&W1#IUUCG{nUuZK8ez6>msaVGzHYN=nE`hV^=X!HS=hT|1`{vu#9qD0
zQSh-$b#37+;6-mwHNW9_#+OswIXltSRQPA_M~9?uB6{bo_B~eGsyqxB3r&+Ow~d6A
z8>Uy}UKeC;y=t!_7rf+4Iq@$g`_1szdpfnoj+(a(`D^MsI~oxqWdw6lr|D4^8`%F+
z`flI-?+u{dSt{;6R=Bd@;4xmH>+&Y@Hxc&xnuL~oIgNj@%U&Y0o{FArKXA_bb|8#3
zfA{1|iO9&cN7W@z{d)8#3-({YJV~9Cqw)7hCC;Af&H9e}uVn1Ob8jYoS~Q>h^0>4t
zkB~gH>hYXEC1WL!EA)?q_@AymxHU5tFD{ndtuOeyXz5vzBHamH9?4joC)$+JKXg7^
zcB(PjqYiM+591wMo_?>HyziB8acX~%ka?dbo46y`m7kiLZI~CZ={w#k^^>}tn$5}9
z{4d5)m+VT*9R-6QYpaPl9-eCaJZ5xE!)Pxw-or@fQqjd2F%U4k*{b6GdBxZ3w;t{&
zLWVv6Q~`ea1|pV|z^_zyzNP8ve1iWY|3rUb014RQ<m~l2g@2i8nKH8+weU`Yp19})
z*a=JY$eW4HH@j9VCa(lOkNv*6Q(kr9^b5=RtD@_b#GAF>#Bpx`Jd?K<9kZTlyr4+`
z_Pgcd3=suKzhe$kSG(s{|9VS53V^P2mkvA#J%t6ZBaC0Zk<|hFKRENbYw)5tud(m3
zzWclYtN!Q>-u&l7K#%No)%JhB`+HX{uZk8Ar23y`VeIk$*IohW(^stT(iyYqN!y6T
zR1BZ=0|w0VYt!Y*e^BXrd+znS12fvc3sPfZV%;w=Kgyu-4uNNij<{9@mV9q2{`Vr2
zfg4wjkjXFaOKS6Z7XjeO{pf^F6kadlvL+QiGo2R<UHAL<zW=T&|K8~Pf8E#nom3f$
ze`iO2P%0Ccv+%vp_B)}B(f>r*zc2MFQ2+3~{oqHpHU6K{_5c16pdu{ucc7*GnI+Wq
z8nNzVd7NXGnqTaV+BjJ(nSEL<yKlhphV|&lCr6ijvtoATg5QNUEcMG~{(2|ga&p1>
z52GmYr)jsCBQL|iE`Pt+ktgVKB<f7gpf;rE=nK!++<obOe_R#Y5O*@ZpZ$UP7JAgL
zRFD6PN#nsS>(=xdJo3DO+Sp9blbU;af0$FmK|c4tsM>KYb)P?0#uu5USo#;?+kvC(
z{RV5vq?e`6@r`GvyGOmDq9sWGpv6!2e@Q+m4@={%|LZ35_5E$H{GSICbxm@tPr>fQ
zD`x#QaW>s(z*-68oW0@Y50Oo^>S@#ry`gX_{gaK3NutxK^U#s^Of`$6e%K;u;@M{q
zEyEnsQvuoCFP2~Bcqo}M-fDody5tro3bfw}Ju|s+u1FfI_`4|T*0%#Bu%g+%4A`Gn
zL4{!AnP(RW^ZyF_LBRd5`6)jRwzu%>Z#e0<*?k$FCx7L$)wmN61y*!t2?zPDjJGcx
z1`uT-_^i(B1&8FpxZ4(8Wm`{Lc6}^w|3c1iG+V-{9tYwTqFRP1U`6jjy+Oa&a4&U3
zj+Ld-wEmPBp;cX>7&OUx4eot%jZv3tI!e2$SRU$){N)n%8ebuJ2w2gSHag|#xrA-6
zyq^b+|GzmVSFX;esXI<hH!NWNV%^7s6f?vU+i#58HN-cD1ViI*1y=HX>X;{;Cps3=
z#*!AE2nz?^;oUx210AO3{~o&jKc%3?Pl?i|?b<PUYi~%?K4PMNDQN-|J6wz1^1^ck
z)vw;HjAZ}MZ%yryPS20j_!$OV|3yP|>5+uFB?VC@K0*5jANR1BlJi7)VG=vGyeymb
z7y;?U2B#G2D7@VwApUoLDLB#e!^$60gnKu=HJ}rnoSRC7!gIh};F<$|>HqmH^#AJ*
z?9b9=o(uloYEZ||gYVgg-M6jNf4XmMpJlIpqxOHwsqeVYs6p?xm;d-em5B@P!(A67
zKl^=q>*DA>@SZqrAD?vM|Hh%|C_HXczOA;6w)szvcIC<xkwt=jmdtZviUMQan)v4b
zM19UX&6$3!<z*N7@73<ypphY-I&;~!1kAkLrR|N5=Jxe&P0V<X2hR_t5qtYp!bQZs
z#reh7OWh=U_DYo4;=Hejy7J6BcV1Wkj4M3OQtN}VgSi2Fi*J9EMqHr(ulBAz9IADV
zBf0IFB!m%iiP4Z1vxE>rO_9swG8JRUiHV}9tsRDDCW!~3hTYj>Tq{Eoj>>J=?mLZ<
z5>0N?#xoIRTn1;&?Aqt@Jo`E4{Cl3g{$J1fzO~->{eJInz3=ZmdCu?yWwPIsIA|)%
zd;bn+%;42D)o`_s#mwJ_cg0_9!4*XbS{K_YtbK<T$0<c-`%3Bej9Vu*_xB1;iTpVx
zqI1OL&Pqi>c%g}Zy|ccJP5iYgGp^B{Gx(ELaq5k$kuS(OMW6dOyw+P&^G?*<7KSwK
z?^HA_3)z<?<7&G?94D{$#Kb<b1{dLGFC^@lO0IVa^Z1pYpE|u>Tu@G}rgq|8=kY^}
zKGlLKEeJzOaCdjj4SX{)|K}Y<n*a9%X83`|Euw&99`LsrUD%C0^9t5r``zY+{<E7U
z*2D;!DO*>h*-B*umTA34a2>2`=!mq7?6lwaqtqz?xwaBc^FG=q%_+>s6K^2~fabcL
z0yIqn#3boaL*MP@V_u@JnQx6lG_603<t7UQZHNgmjIAe5(-DxD?L+DjD9|1$jV)Rh
zd)r}>NJ#9A;dNT7PcoNw%4e|I&|H|C=V=8JDBH-tIO>)YSp4i3mD*S{CBEMgUmWXB
zD`+)r(e^c-O*?5)Wp`Fz)j$fP?n48#rY&IN8`Cn2ETKB-qW6vN&d$3&H$#<z=P&1D
zm~m`0q;F&_P5)5R=KjjtD!}vMfF9_ns9&9Ztt|0|gB2;`)65devEhw_ddI=oUOxW@
zXS@fei_rJud5i}obOh4GT(EVw3ct8(ei1wy7JXOVF0{S&`tHoedZpa5bRE^=Pqo%Q
zD{5SmXvMufu$Fxw;1jnrx#m7j&Wj6)Awe^?^dYgSBvVI3C`{4+SMSF`IXY`7i`ZI2
zxrcT*PP^o|(i1B$?C-{0P)e1RVxAayX$_ha-33)LATka&vZZ@vdC6~vXuGJ8tI1IX
zNrXfN+0?YW5|kSNhTyx$3~IhSQiG+PPpmADCeYly7Y>?nXWI)D;EI~han9}183O=?
zw>p`w^5r#SM0j>ktVxCrzFcQ`_BuOC)xhsY71ZI<;zrX@HRxr|9%jzUfFN)nm+{uu
zi^Yqe$nusR_>2;7b1wP6-_hpu?WC@gTA~`B%1J0|4l)BxWV(k#3irCK^O3lC_=os?
zST@~G3p}=-j2#YY&HWY#9;g6EJu=upern7))*#QyU3l*L%6n6ylnotyq9ILi_p8{6
zAJs@+I%LLn?LI1(&S=E<%JZ+85);Q@iYx^~P3_xHvbV53#<z|As7ZrbJH)}0L=xLt
zoHJ2MU|ExD2Cjzgk85zFLkDy5X5rqaUf*fZQm)BB2|<UpiQsaEL*}L+@*L-kLyTW|
z6*8GH58p<5<gtn3j#WgU8sX(opGSIMfOZJ20P5(KgQYekg_kc0x3_ZcIQ!<r>k{WD
zv!~}1BIofk6|4A{|FF?l%K8VU!*1INfeXO~&|5!}o_0RTh$)#lrml~IMp8E758zWh
zLh6l}6Nls;O9~xs+A&9&)8R|SxTYbB(}`3}uPVI%XUM||Rj_UI#QFJJp=Wid({#w^
zqgn=b^%+rR-^$QbNPQP%H~>!J3@x1t<p>v`fcd^LNrN#Iq6fMFV5kJj$$LK}r5hL>
zi&ps>h##on;Mk?Zj9)}i;T0Wa=B)`!WqkCdcJ8womBY&Dlo1>>hra8Ema||ga-35R
zx{@tk^lFrg?Eo~1w5=v@wm*q#*ts8i2H+T%`>LOZgvu0|k_$@8gy)yWg(!$9E}@U9
zHD<D5vq0IKX1Xa&O^|XWY&!#>kg1rr>~IE9q+3QvRAb3>heE4;6;JUrzQ5*Us2#ah
zZuk}dJ-gy@oGBULcr{3(a-Q554kAZ`&>%p(fHskYOlYcWTT4a181%Ya>RPU$IXg!$
z7fy}u&*Jk{f}hzzqHI*u)ijK&C*;y%SjAio+73Ujs`UcnQ)*Q(tgrm?ip8;)UL}3$
zPt=95vd_P=VB!D~t9v}0BKv-%WX?uEjD9WynjvbWjfU@Jvzz9Y(mpykJXStO-vhnu
zD%Ije1~4`4#vZq5i#PHq4@=_U8Njam4ytU^vn7RGEf5J<AsS*Nh*fBT`Bn^E(e5Nc
z3(*9KgGRRbF$roCz@8J%y_sVl#6kA~SmxN^=K8=QZjV9N-8PT=XLHDx?s1)U(dLAP
z^S8xdZmR3_h6txF>B;*x+g(yc2C*r2>ZC5((t|4d>#8ealn*Q?tdYMgNC;se4k}+H
z<$Qv}?UstAq&nm|cFV=x@zO^&MTgREWwrv7%Ud!xvc)kC5v+JE41P<yobo%@<SY5=
zlb<JNiSfaA_Wvsg*9I+Bvz+M<%ZKh!JG*9#{uAOo3w?krZpFmh1@ocGJ}=D;KO|@u
z>AT^T+;~OuASG%<eu%%}xrax8b_vRP{u$VL9%N<-`@x(45P}bi(NYQ(o+M`JeXbZ_
z_Y}so7DopRSHc&HQ2vn_r#GX1M+d)U9S+10nkV$9PVWz4^Eg6mVYH%yEFP4YO|RpV
z1j?nQB>Q=-_97hA4N)YBQq`%3dQ@WVm+Kp9C;n~0hr{7mg+J4VKeN04$Lvl_TckZl
zs}m)_tChcHn5*osx?(zC>p%WinjsY44;4lj3x6;L-xO)2ab<d9--bPQ4Mstg`O$$a
zL~+6<>8_kJKjGd43h4>DrNl)oZquS99BZIoPN;`u?M}CAT+Xt*$UW{UVv3NL0)AnN
zMASY(Jk22?53@*iL8x5rjo=mNu#WsSZe?oMXjAyYryWA_BN;&(2L=hyErahYCi?O^
z^hazD^|Ny#2B)9f*qW@y@%y^GhAj#Pe>&9^92C0k#l@pXQy>Ar!_7@UJf+PHJ*_I-
z`@>?`^2g@u**>``A{99~5rY``P0LOS8t5#ds-);dsNF!c%7EjtP<5AxeM#B<UrXU7
b!811GOWhi2AHs?Y#h?r0bj*?A5RmwHM`(OI

literal 0
HcmV?d00001

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 677a5d034..10db90768 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -18,7 +18,7 @@ jobs:
         submodules: true
     - name: Install Dependencies
       run: |
-          sudo apt install -y libopenmpi-dev openmpi-bin libblas-dev liblapack-dev
+          sudo apt install -y libopenmpi-dev openmpi-bin libopenblas-serial-dev
     - name: Build
       run: make -j `nproc` verbose=true
       env:
@@ -28,4 +28,4 @@ jobs:
         make test
         bash <(curl --no-buffer -s https://codecov.io/bash) -x gcov
       env:
-        LIBP_COVERAGE: 1
\ No newline at end of file
+        LIBP_COVERAGE: 1
diff --git a/.gitignore b/.gitignore
index e260c13cc..edd9dec2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@
 *.m~
 *.nvprof
 test/__pycache__
+test/*.rc
 
 !test/squareTri.msh
 !test/squareQuad.msh
@@ -34,4 +35,4 @@ test/__pycache__
 !test/cubeHex.msh
 
 .vimrc
-.occa/
\ No newline at end of file
+.occa/
diff --git a/3rdParty/gslib/.travis.yml b/3rdParty/gslib/.travis.yml
deleted file mode 100644
index 939c88fb2..000000000
--- a/3rdParty/gslib/.travis.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-language: c
-
-before_install:
-   - export ROOT_DIR=`pwd`
-   - sudo apt-get update -qq
-   - sudo apt-get install -y mpich2 libmpich2-dev
-
-env:
-  matrix:
-    - TEST=crystal_test            NP=2
-    - TEST=findpts_el_2_test       NP=2
-    - TEST=findpts_el_2_test2      NP=2
-    - TEST=findpts_el_3_test       NP=2
-    - TEST=findpts_el_3_test2      NP=2
-    - TEST=findpts_el_2_test2      NP=2
-    - TEST=findpts_local_test      NP=2
-    - TEST=findpts_test            NP=2
-    - TEST=gs_test                 NP=2
-    - TEST=gs_test_old             NP=2
-    - TEST=gs_test_gop_nonblocking NP=2
-    - TEST=gs_test_gop_blocking    NP=2
-    - TEST=gs_unique_test          NP=2
-    - TEST=lob_bnd_test            NP=2
-    - TEST=obbox_test              NP=2
-    - TEST=poly_test               NP=2
-    - TEST=sarray_sort_test        NP=2
-    - TEST=sarray_transfer_test    NP=2
-    - TEST=sort_test               NP=2
-    - TEST=sort_test2              NP=2
-
-install: true
-
-script:
-  - cd $ROOT_DIR
-  - make CC=mpicc tests/$TEST
-  - mpiexec -np $NP ./tests/$TEST
diff --git a/3rdParty/gslib/LICENSE b/3rdParty/gslib/LICENSE
deleted file mode 100644
index d0904265f..000000000
--- a/3rdParty/gslib/LICENSE
+++ /dev/null
@@ -1,58 +0,0 @@
-Copyright (c) 2008-2017, UCHICAGO ARGONNE, LLC. 
-
-The UChicago Argonne, LLC as Operator of Argonne National
-Laboratory holds copyright in the Software. The copyright holder
-reserves all rights except those expressly granted to licensees,
-and U.S. Government license rights.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the disclaimer below.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the disclaimer (as noted below)
-in the documentation and/or other materials provided with the
-distribution.
-
-3. Neither the name of ANL nor the names of its contributors
-may be used to endorse or promote products derived from this software
-without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
-FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 
-UCHICAGO ARGONNE, LLC, THE U.S. DEPARTMENT OF 
-ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 
-TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Additional BSD Notice
----------------------
-1. This notice is required to be provided under our contract with
-the U.S. Department of Energy (DOE). This work was produced at
-Argonne National Laboratory under Contract 
-No. DE-AC02-06CH11357 with the DOE.
-
-2. Neither the United States Government nor UCHICAGO ARGONNE, 
-LLC nor any of their employees, makes any warranty, 
-express or implied, or assumes any liability or responsibility for the
-accuracy, completeness, or usefulness of any information, apparatus,
-product, or process disclosed, or represents that its use would not
-infringe privately-owned rights.
-
-3. Also, reference herein to any specific commercial products, process, 
-or services by trade name, trademark, manufacturer or otherwise does 
-not necessarily constitute or imply its endorsement, recommendation, 
-or favoring by the United States Government or UCHICAGO ARGONNE LLC. 
-The views and opinions of authors expressed 
-herein do not necessarily state or reflect those of the United States 
-Government or UCHICAGO ARGONNE, LLC, and shall 
-not be used for advertising or product endorsement purposes.
diff --git a/3rdParty/gslib/Makefile b/3rdParty/gslib/Makefile
deleted file mode 100644
index 7e32ac09e..000000000
--- a/3rdParty/gslib/Makefile
+++ /dev/null
@@ -1,187 +0,0 @@
-#####################################################################################
-#
-#The MIT License (MIT)
-#
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-#
-#Permission is hereby granted, free of charge, to any person obtaining a copy
-#of this software and associated documentation files (the "Software"), to deal
-#in the Software without restriction, including without limitation the rights
-#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#copies of the Software, and to permit persons to whom the Software is
-#furnished to do so, subject to the following conditions:
-#
-#The above copyright notice and this permission notice shall be included in all
-#copies or substantial portions of the Software.
-#
-#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-#SOFTWARE.
-#
-#####################################################################################
-
-ifndef LIBP_MAKETOP_LOADED
-ifeq (,$(wildcard ../../make.top))
-  $(error cannot locate ${PWD}/../../make.top)
-else
-  include ../../make.top
-endif
-endif
-
-MPI ?= 1
-MPIIO ?= 1
-ADDUS ?= 1
-USREXIT ?= 0
-LIBNAME ?= gs
-BLAS ?= 0
-CFLAGS ?= $(CXXFLAGS)
-FFLAGS ?= $(CXXFLAGS)
-
-SRCROOT=.
-TESTDIR=$(SRCROOT)/tests
-FTESTDIR=$(TESTDIR)/fortran
-SRCDIR=$(SRCROOT)/src
-INCDIR=$(SRCROOT)/src
-LIBDIR=$(SRCROOT)/lib
-
-ifneq (,$(strip $(PREFIX)))
-INSTALL_ROOT = $(PREFIX)
-else
-INSTALL_ROOT = $(LIBDIR)
-endif
-
-ifneq (0,$(MPI))
-  G+=-DMPI
-endif
-
-ifneq (0,$(MPIIO))
-  ifneq (0,$(MPI))
-    G+=-DUSEMPIIO
-  endif
-endif
-
-ifneq (0,$(ADDUS))
-  G+=-DUNDERSCORE
-endif
-
-ifneq (0,$(USREXIT))
-  G+=-DUSE_USR_EXIT
-endif
-
-ifeq (0,$(BLAS))
-  G+=-DUSE_NAIVE_BLAS
-endif
-
-ifeq (1,$(BLAS))
-  G+=-DUSE_CBLAS
-endif
-
-ifneq ($(PREFIX),)
-  G+=-DPREFIX=$(PREFIX)
-endif
-
-ifneq ($(FPREFIX),)
-  G+=-DFPREFIX=$(FPREFIX)
-endif
-
-G+=-DGLOBAL_LONG_LONG
-#G+=-DPRINT_MALLOCS=1
-#G+=-DGS_TIMING -DGS_BARRIER
-
-CCCMD=$(LIBP_MPICC) $(LIBP_CFLAGS) -I$(INCDIR) $(G)
-FCCMD=$(FC) $(FFLAGS)
-
-LINKCMD=$(LIBP_MPICC) $(LIBP_CFLAGS) -I$(INCDIR) $(G) $^ -o $@ -L$(SRCDIR) \
-        -l$(LIBNAME) -lm $(LDFLAGS)
-
-TESTS=$(TESTDIR)/sort_test $(TESTDIR)/sort_test2 $(TESTDIR)/sarray_sort_test \
-      $(TESTDIR)/comm_test $(TESTDIR)/crystal_test \
-      $(TESTDIR)/sarray_transfer_test $(TESTDIR)/gs_test \
-      $(TESTDIR)/gs_test_gop_blocking $(TESTDIR)/gs_test_gop_nonblocking \
-      $(TESTDIR)/gs_unique_test $(TESTDIR)/gs_test_old \
-      $(TESTDIR)/findpts_el_2_test \
-      $(TESTDIR)/findpts_el_2_test2 $(TESTDIR)/findpts_el_3_test \
-      $(TESTDIR)/findpts_el_3_test2 $(TESTDIR)/findpts_local_test \
-      $(TESTDIR)/findpts_test $(TESTDIR)/poly_test \
-      $(TESTDIR)/lob_bnd_test $(TESTDIR)/obbox_test
-
-FTESTS=$(FTESTDIR)/f-igs
-
-GS=$(SRCDIR)/gs.o $(SRCDIR)/sort.o $(SRCDIR)/sarray_transfer.o \
-   $(SRCDIR)/sarray_sort.o $(SRCDIR)/gs_local.o $(SRCDIR)/fail.o \
-   $(SRCDIR)/crystal.o $(SRCDIR)/comm.o $(SRCDIR)/tensor.o
-
-FWRAPPER=$(SRCDIR)/fcrystal.o $(SRCDIR)/findpts.o
-INTP=$(SRCDIR)/findpts_local.o $(SRCDIR)/obbox.o $(SRCDIR)/poly.o \
-     $(SRCDIR)/lob_bnd.o $(SRCDIR)/findpts_el_3.o $(SRCDIR)/findpts_el_2.o
-
-.PHONY: all install deps tests clean objects odepinfo
-
-all : $(SRCDIR)/lib$(LIBNAME).a tests
-
-$(SRCDIR)/lib$(LIBNAME).a: $(GS) $(FWRAPPER) $(INTP) $(SRCDIR)/rand_elt_test.o
-ifneq (,${verbose})
-	$(AR) cr $(SRCDIR)/lib$(LIBNAME).a $?
-else
-	@printf "%b" "$(LIB_COLOR)Building library lib$(LIBNAME).a $(NO_COLOR)\n";
-	@$(AR) cr $(SRCDIR)/lib$(LIBNAME).a $?
-endif
-	@ranlib $(SRCDIR)/lib$(LIBNAME).a
-
-install: $(SRCDIR)/lib$(LIBNAME).a
-	@mkdir -p $(INSTALL_ROOT) 2>/dev/null
-	@cp $(SRCDIR)/lib$(LIBNAME).a $(INSTALL_ROOT) 2>/dev/null
-
-tests: $(TESTS) $(FTESTS)
-
-clean: ; @$(RM) $(SRCDIR)/*.o $(SRCDIR)/*.s $(SRCDIR)/*.a $(TESTS) $(TESTS)/*.o $(FTESTS) $(FTESTS)/*.o
-
-cmds: ; @echo LIBP_MPICC = $(CCCMD); echo LINK = $(LINKCMD);
-
-deps: ; ./cdep.py *.c > makefile.cdep;
-
-odepinfo: deps objects; @./odep_info.py *.o
-
-$(TESTS): % : %.o | lib
-	$(LINKCMD)
-
-$(FTESTS): % : %.o | lib
-	$(FCCMD) $^ -o $@ -L$(SRCDIR) -l$(LIBNAME)
-
--include makefile.cdep
-
-%.o: %.c
-ifneq (,${verbose})
-	$(CCCMD) -c $< -o $@
-else
-	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(CCCMD) -c $< -o $@
-endif
-
-%.o: %.f
-ifneq (,${verbose})
-	$(FCCMD) -c $< -o $@
-else
-	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(FCCMD) -c $< -o $@
-endif
-
-%.s: %.c
-ifneq (,${verbose})
-	$(CCCMD) -S $< -o $@
-else
-	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(CCCMD) -S $< -o $@
-endif
-
-objects: $(OBJECTS)
-
-#poly_imp.h: gen_poly_imp.c
-#	$(RM) poly_imp.h;
-#	$(LIBP_MPICC) -lgmp -lm gen_poly_imp.c -o gen_poly_imp;
-#	./gen_poly_imp > poly_imp.h;
-#	$(RM) gen_poly_imp
diff --git a/3rdParty/gslib/README.md b/3rdParty/gslib/README.md
deleted file mode 100644
index 8ded873a3..000000000
--- a/3rdParty/gslib/README.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# GSLIB 
-
-[![Build Status](https://travis-ci.org/gslib/gslib.svg?branch=master)](https://travis-ci.org/gslib/gslib)
-
-* Scalable Many-to-Many collectives
-* Robust interpolation for hexahedral spectral element meshes
-
-# Build Instructions
-
-The build system relies on GNU Make with the `make` command. To compile gslib just run:
-
-```
-make CC=mpicc FC=mpif77
-make PREFIX=<install path> install
-```
-
-# Applications
-
-**\[1]&#160;[Nek5000](https://nek5000.mcs.anl.gov/)**: Nek5000 open-source, spectral element code.
-
-**\[2]&#160;[CEED](http://ceed.exascaleproject.org/)**: Co-design center for Efficient Exascale Discretizations.
-
-**\[3]&#160;[Nektar++](http://www.nektar.info)**: Nektar++ open-source spectral/hp element code.
diff --git a/3rdParty/gslib/RELEASE.md b/3rdParty/gslib/RELEASE.md
deleted file mode 100644
index fdaf06f49..000000000
--- a/3rdParty/gslib/RELEASE.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# Release 1.0.3
-
-## Major Features and Improvements
-* Added non-blocking gather/scatter operations (CR not supported yet)
-* Added Fortran wrapper for gs_unique
-* Added gs_hf2c to convert Fortran into C handle
-
-## Backwards-Incompatible Changes 
-* Removed XXT and AMG solver from distribution
-
-## Bug Fixes and Other Changes
-
-[17](https://github.com/gslib/gslib/issues/17)
-
-## Thanks to our Contributors
-This release contains contributions from: @stgeke 
-We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of inspiring discussions.
diff --git a/3rdParty/gslib/cdep.py b/3rdParty/gslib/cdep.py
deleted file mode 100755
index a0dd87a50..000000000
--- a/3rdParty/gslib/cdep.py
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/usr/bin/python
-
-import sys, os, re
-
-#mergestr = lambda x: reduce((lambda a,b: a+" "+b),x,"")
-
-pathjoin = lambda a,b: os.path.normpath(os.path.join(a,b))
-include_re = re.compile("\s*#\s*include\s*\"([^\"]*)\"")
-incmatch = lambda x: ( include_re.match(line) for line in open(x) )
-incline = lambda x,m: pathjoin(os.path.split(x)[0],m.group(1))
-incl = lambda x: [ incline(x,m) for m in incmatch(x) if m!=None ]
-includes = {}
-def get_include(x):
-	if not includes.has_key(x): includes[x] = incl(x)
-	return includes[x]
-
-def closure(seq,f):
-	v = [], [x for x in seq], set(x for x in seq)
-	while len(v[1]): [(v[1].append(y),v[2].add(y)) for y in 
-	  f((lambda x: (v[0].append(x),x)[1])(v[1].pop())) if not y in v[2]]
-	return v[0]
-
-src_files = sys.argv[1:]
-files = closure(src_files, get_include)
-deps = dict((x,closure(includes[x],lambda y: includes[y])) for x in src_files)
-
-obj = lambda x: os.path.splitext(x)[0]+".o"
-
-for x in src_files:
-	print obj(x)+": "+x+reduce((lambda a,b: a+" "+b),deps[x],"")
-
-print
-print "OBJECTS="+reduce((lambda a,b: a+" "+obj(b)),src_files,"")
diff --git a/3rdParty/gslib/makefile.cdep b/3rdParty/gslib/makefile.cdep
deleted file mode 100644
index e5c6ee766..000000000
--- a/3rdParty/gslib/makefile.cdep
+++ /dev/null
@@ -1,42 +0,0 @@
-amg.o: amg.c gs.h sarray_transfer.h crystal.h comm.h gs_defs.h sarray_sort.h sort.h mem.h fail.h types.h name.h c99.h
-comm.o: comm.c comm.h gs_local.h gs_defs.h tensor.h types.h fail.h name.h
-comm_test.o: comm_test.c comm.h gs_defs.h types.h fail.h name.h
-crs_test.o: crs_test.c crs.h gs.h comm.h gs_defs.h mem.h types.h fail.h name.h c99.h
-crystal.o: crystal.c mem.h comm.h types.h fail.h name.h c99.h
-crystal_test.o: crystal_test.c crystal.h mem.h comm.h types.h fail.h name.h c99.h
-fail.o: fail.c comm.h types.h fail.h name.h
-fcrs.o: fcrs.c crs.h comm.h mem.h types.h fail.h name.h c99.h
-fcrystal.o: fcrystal.c sarray_transfer.h sarray_sort.h sort.h crystal.h comm.h mem.h types.h fail.h name.h c99.h
-findpts.o: findpts.c findpts_imp.h findpts_imp.h sarray_sort.h sort.h sarray_transfer.h crystal.h comm.h gs_defs.h findpts_local.h findpts_el.h obbox.h poly.h mem.h fail.h types.h name.h c99.h
-findpts_el_2.o: findpts_el_2.c poly.h tensor.h mem.h types.h fail.h name.h c99.h
-findpts_el_2_test2.o: findpts_el_2_test2.c rand_elt_test.h findpts_el.h obbox.h lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h
-findpts_el_2_test.o: findpts_el_2_test.c findpts_el.h poly.h mem.h fail.h types.h name.h c99.h
-findpts_el_3.o: findpts_el_3.c poly.h tensor.h mem.h types.h fail.h name.h c99.h
-findpts_el_3_test2.o: findpts_el_3_test2.c rand_elt_test.h findpts_el.h obbox.h lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h
-findpts_el_3_test.o: findpts_el_3_test.c findpts_el.h poly.h mem.h fail.h types.h name.h c99.h
-findpts_local.o: findpts_local.c findpts_local_imp.h findpts_local_imp.h findpts_el.h sarray_sort.h sort.h poly.h obbox.h mem.h fail.h name.h types.h c99.h
-findpts_local_test.o: findpts_local_test.c rand_elt_test.h findpts_local.h findpts_el.h obbox.h poly.h types.h mem.h fail.h name.h c99.h
-findpts_test.o: findpts_test.c sarray_transfer.h crystal.h findpts.h rand_elt_test.h comm.h gs_defs.h poly.h mem.h types.h fail.h name.h c99.h
-gen_poly_imp.o: gen_poly_imp.c
-gs.o: gs.c sarray_transfer.h sarray_sort.h crystal.h sort.h mem.h comm.h gs_local.h gs_defs.h types.h fail.h name.h c99.h
-gs_local.o: gs_local.c gs_defs.h types.h name.h c99.h
-gs_test.o: gs_test.c gs.h gs_defs.h mem.h comm.h types.h fail.h name.h c99.h
-gs_test_old.o: gs_test_old.c types.h name.h
-gs_unique_test.o: gs_unique_test.c gs.h gs_defs.h mem.h comm.h types.h fail.h name.h c99.h
-lob_bnd.o: lob_bnd.c poly.h mem.h fail.h types.h name.h c99.h
-lob_bnd_test.o: lob_bnd_test.c lob_bnd.h poly.h tensor.h mem.h fail.h name.h types.h c99.h
-obbox.o: obbox.c lob_bnd.h poly.h tensor.h mem.h types.h fail.h name.h c99.h
-obbox_test.o: obbox_test.c rand_elt_test.h obbox.h lob_bnd.h poly.h mem.h fail.h name.h types.h c99.h
-poly.o: poly.c poly_imp.h mem.h types.h fail.h name.h c99.h
-poly_test.o: poly_test.c poly.h types.h name.h c99.h
-rand_elt_test.o: rand_elt_test.c lob_bnd.h poly.h name.h types.h c99.h
-sarray_sort.o: sarray_sort.c sort.h mem.h fail.h types.h name.h c99.h
-sarray_sort_test.o: sarray_sort_test.c sarray_sort.h sort.h mem.h types.h fail.h name.h c99.h
-sarray_transfer.o: sarray_transfer.c sort.h crystal.h mem.h comm.h types.h fail.h name.h c99.h
-sarray_transfer_test.o: sarray_transfer_test.c sarray_transfer.h crystal.h sarray_sort.h sort.h mem.h comm.h types.h fail.h name.h c99.h
-sort.o: sort.c sort_imp.h sort_imp.h sort_imp.h mem.h types.h fail.h name.h c99.h
-sort_test2.o: sort_test2.c sort.h mem.h types.h fail.h name.h c99.h
-sort_test.o: sort_test.c sort.h mem.h types.h fail.h name.h c99.h
-tensor.o: tensor.c types.h name.h c99.h
-
-OBJECTS= comm.o comm_test.o crs_test.o crystal.o crystal_test.o fail.o fcrs.o fcrystal.o findpts.o findpts_el_2.o findpts_el_2_test2.o findpts_el_2_test.o findpts_el_3.o findpts_el_3_test2.o findpts_el_3_test.o findpts_local.o findpts_local_test.o findpts_test.o gen_poly_imp.o gs.o gs_local.o gs_test.o gs_test_old.o gs_unique_test.o lob_bnd.o lob_bnd_test.o obbox.o obbox_test.o poly.o poly_test2.o poly_test.o rand_elt_test.o sarray_sort.o sarray_sort_test.o sarray_transfer.o sarray_transfer_test.o sort.o sort_test2.o sort_test.o tensor.o
diff --git a/3rdParty/gslib/odep_info.py b/3rdParty/gslib/odep_info.py
deleted file mode 100755
index 620d0ec4a..000000000
--- a/3rdParty/gslib/odep_info.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/python
-
-import sys, os, re
-
-obj_files = sys.argv[1:]
-
-defined = dict((x,set([])) for x in obj_files)
-undefined = dict((x,set([])) for x in obj_files)
-nm_re = re.compile("[0-9a-fA-F]*\s*([BCDRTU])\s+([A-Za-z_][A-Za-z_0-9]*)\s*")
-def nm_match(x): return ( nm_re.match(line) for line in os.popen('nm -g '+x) )
-def nm_line(x,m):
-	if m.group(1)=='U': undefined[x].add(m.group(2))
-	else: defined[x].add(m.group(2))
-[ [ nm_line(x,m) for m in nm_match(x) if m!=None ] for x in obj_files ]
-
-def closure(seq,f):
-	v = [], [x for x in seq], set(x for x in seq)
-	while len(v[1]): [(v[1].append(y),v[2].add(y)) for y in 
-	  f((lambda x: (v[0].append(x),x)[1])(v[1].pop())) if not y in v[2]]
-	return v[0]
-
-needs={}
-def get_needs(x):
-	if not needs.has_key(x):
-		needs[x]=[y for y in obj_files if len(defined[y]&undefined[x])]
-	return needs[x]
-deps = dict((x,closure(get_needs(x),get_needs)) for x in obj_files)
-
-for x in deps:
-	print x,'depends on',reduce((lambda a,b: a+" "+b),deps[x],"")
-print
-
-results = [ os.path.splitext(x)[0] for x in obj_files if 'main' in defined[x] ]
-print "RESULTS="+reduce((lambda a,b: a+" "+b),results,"")
-print
-
-def need_X(objs):
-	for x in objs:
-		if "XOpenDisplay" in undefined[x]: return True
-	return False
-
-for x in results:
-	objs = deps[x+'.o'];
-	if not (x+'.o') in objs: objs.append(x+'.o')
-	sobjs = reduce((lambda a,b: a+" "+b),objs,"")
-	if need_X(objs):
-		print x+":"+sobjs+" ; @echo LINK $@; $(LINKCMD) $^ -lX11 -o $@"
-	else:
-		print x+":"+sobjs+" ; @echo LINK $@; $(LINKCMD) $^ -o $@"
-
diff --git a/3rdParty/gslib/src/c99.h b/3rdParty/gslib/src/c99.h
deleted file mode 100644
index a5a44e3a6..000000000
--- a/3rdParty/gslib/src/c99.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef C99_H
-#define C99_H
-
-#ifndef __STDC_VERSION__
-#  define NO_C99
-#elif __STDC_VERSION__ < 199901L
-#  define NO_C99
-#endif
-
-#ifdef NO_C99
-#  define restrict
-#  define inline
-#  undef NO_C99
-#endif
-
-#endif
diff --git a/3rdParty/gslib/src/comm.c b/3rdParty/gslib/src/comm.c
deleted file mode 100644
index e537278f4..000000000
--- a/3rdParty/gslib/src/comm.c
+++ /dev/null
@@ -1,210 +0,0 @@
-#include <stddef.h> /* for size_t */
-#include <stdlib.h> /* for exit */
-#include <string.h> /* memcpy */
-#include <limits.h> /* for gs identities */
-#include <float.h>  /* for gs identities */
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "tensor.h"
-#include "gs_defs.h"
-#include "gs_local.h"
-#include "comm.h"
-
-uint comm_gbl_id=0, comm_gbl_np=1;
-
-GS_DEFINE_IDENTITIES()
-GS_DEFINE_DOM_SIZES()
-
-static void scan_imp(void *scan, const struct comm *com, gs_dom dom, gs_op op,
-                     const void *v, uint vn, void *buffer)
-{
-  comm_req req[2];
-  size_t vsize = vn*gs_dom_size[dom];
-  const uint id=com->id, np=com->np;
-  uint n = np, c=1, odd=0, base=0;
-  void *buf[2];
-  void *red = (char*)scan+vsize;
-  buf[0]=buffer,buf[1]=(char*)buffer+vsize;
-  while(n>1) {
-    odd=(odd<<1)|(n&1);
-    c<<=1, n>>=1;
-    if(id>=base+n) c|=1, base+=n, n+=(odd&1);
-  }
-  gs_init_array(scan,vn,dom,op);
-  memcpy(red,v,vsize);
-  while(n<np) {
-    if(c&1) n-=(odd&1), base-=n;
-    c>>=1, n<<=1, n+=(odd&1);
-    odd>>=1;
-    if(base==id) {
-      comm_irecv(&req[0],com, buf[0],vsize, id+n/2,id+n/2);
-      comm_isend(&req[1],com, red   ,vsize, id+n/2,id);
-      comm_wait(req,2);
-      gs_gather_array(red,buf[0],vn,dom,op);
-    } else {
-      comm_irecv(&req[0],com, scan,vsize, base,base);
-      comm_isend(&req[1],com, red ,vsize, base,id);
-      comm_wait(req,2);
-      break;
-    }
-  }
-  while(n>1) {
-    if(base==id) {
-      comm_send(com, scan  ,2*vsize, id+n/2,id);
-    } else {
-      comm_recv(com, buffer,2*vsize, base,base);
-      gs_gather_array(scan,buf[0],vn,dom,op);
-      memcpy(red,buf[1],vsize);
-    }
-    odd=(odd<<1)|(n&1);
-    c<<=1, n>>=1;
-    if(id>=base+n) c|=1, base+=n, n+=(odd&1);
-  }
-}
-
-
-static void allreduce_imp(const struct comm *com, gs_dom dom, gs_op op,
-                          void *v, uint vn, void *buf)
-{
-  size_t total_size = vn*gs_dom_size[dom];
-  const uint id=com->id, np=com->np;
-  uint n = np, c=1, odd=0, base=0;
-  while(n>1) {
-    odd=(odd<<1)|(n&1);
-    c<<=1, n>>=1;
-    if(id>=base+n) c|=1, base+=n, n+=(odd&1);
-  }
-  while(n<np) {
-    if(c&1) n-=(odd&1), base-=n;
-    c>>=1, n<<=1, n+=(odd&1);
-    odd>>=1;
-    if(base==id) {
-      comm_recv(com, buf,total_size, id+n/2,id+n/2);
-      gs_gather_array(v,buf,vn, dom,op);
-    } else {
-      comm_send(com, v,total_size, base,id);
-      break;
-    }
-  }
-  while(n>1) {
-    if(base==id)
-      comm_send(com, v,total_size, id+n/2,id);
-    else
-      comm_recv(com, v,total_size, base,base);
-    odd=(odd<<1)|(n&1);
-    c<<=1, n>>=1;
-    if(id>=base+n) c|=1, base+=n, n+=(odd&1);
-  }
-}
-
-void comm_scan(void *scan, const struct comm *com, gs_dom dom, gs_op op,
-               const void *v, uint vn, void *buffer)
-{
-  scan_imp(scan, com,dom,op, v,vn, buffer);
-}
-
-void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op,
-                          void *v, uint vn, void *buf)
-{
-  if(vn==0) return;
-#ifdef MPI
-  {
-    MPI_Datatype mpitype;
-    MPI_Op mpiop;
-    #define DOMAIN_SWITCH() do { \
-      switch(dom) { case gs_double:    mpitype=MPI_DOUBLE;    break; \
-                    case gs_float:     mpitype=MPI_FLOAT;     break; \
-                    case gs_int:       mpitype=MPI_INT;       break; \
-                    case gs_long:      mpitype=MPI_LONG;      break; \
-     WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \
-                  default:        goto comm_allreduce_byhand; \
-      } \
-    } while(0)
-    DOMAIN_SWITCH();
-    #undef DOMAIN_SWITCH
-    switch(op) { case gs_add: mpiop=MPI_SUM;  break;
-                 case gs_mul: mpiop=MPI_PROD; break;
-                 case gs_min: mpiop=MPI_MIN;  break;
-                 case gs_max: mpiop=MPI_MAX;  break;
-                 default:        goto comm_allreduce_byhand;
-    }
-    MPI_Allreduce(v,buf,vn,mpitype,mpiop,com->c);
-    memcpy(v,buf,vn*gs_dom_size[dom]);
-    return;
-  }
-#endif
-#ifdef MPI
-comm_allreduce_byhand:
-  allreduce_imp(com,dom,op, v,vn, buf);
-#endif
-}
-
-void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op,
-                          void *v, uint vn, void *buf)
-{
-  if(vn==0) return;
-#ifdef MPI
-  {
-    MPI_Datatype mpitype;
-    MPI_Op mpiop;
-    #define DOMAIN_SWITCH() do { \
-      switch(dom) { case gs_double:    mpitype=MPI_DOUBLE;    break; \
-                    case gs_float:     mpitype=MPI_FLOAT;     break; \
-                    case gs_int:       mpitype=MPI_INT;       break; \
-                    case gs_long:      mpitype=MPI_LONG;      break; \
-     WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \
-                  default:        goto comm_allreduce_byhand; \
-      } \
-    } while(0)
-    DOMAIN_SWITCH();
-    #undef DOMAIN_SWITCH
-    switch(op) { case gs_add: mpiop=MPI_SUM;  break;
-                 case gs_mul: mpiop=MPI_PROD; break;
-                 case gs_min: mpiop=MPI_MIN;  break;
-                 case gs_max: mpiop=MPI_MAX;  break;
-                 default:        goto comm_allreduce_byhand;
-    }
-    MPI_Iallreduce(v,buf,vn,mpitype,mpiop,com->c,req);
-    return;
-  }
-#endif
-#ifdef MPI
-comm_allreduce_byhand:
-  allreduce_imp(com,dom,op, v,vn, buf);
-#endif
-}
-
-double comm_dot(const struct comm *comm, double *v, double *w, uint n)
-{
-  double s=tensor_dot(v,w,n),b;
-  comm_allreduce(comm,gs_double,gs_add, &s,1, &b);
-  return s;
-}
-
-/* T comm_reduce__T(const struct comm *comm, gs_op op, const T *in, uint n) */
-
-#define SWITCH_OP_CASE(T,OP) case gs_##OP: WITH_OP(T,OP); break;
-#define SWITCH_OP(T,op) do switch(op) { \
-    GS_FOR_EACH_OP(T,SWITCH_OP_CASE) case gs_op_n: break; } while(0)
-
-#define WITH_OP(T,OP) \
-  do { T v = *in++; GS_DO_##OP(accum,v); } while(--n)
-
-#define DEFINE_REDUCE(T) \
-T PREFIXED_NAME(comm_reduce__##T)( \
-    const struct comm *comm, gs_op op, const T *in, uint n) \
-{                                                           \
-  T accum = gs_identity_##T[op], buf;                       \
-  if(n!=0) SWITCH_OP(T,op);                                 \
-  comm_allreduce(comm,gs_##T,op, &accum,1, &buf);           \
-  return accum;                                             \
-}
-
-GS_FOR_EACH_DOMAIN(DEFINE_REDUCE)
-
-#undef DEFINE_REDUCE
-#undef WITH_OP
-#undef SWITCH_OP
-#undef SWITCH_OP_CASE
-
diff --git a/3rdParty/gslib/src/comm.h b/3rdParty/gslib/src/comm.h
deleted file mode 100644
index 1bd88264a..000000000
--- a/3rdParty/gslib/src/comm.h
+++ /dev/null
@@ -1,259 +0,0 @@
-#ifndef COMM_H
-#define COMM_H
-
-/* requires:
-     <stddef.h>            for size_t
-     <stdlib.h>            for exit
-     "fail.h", "types.h"
-     "gs_defs.h"           for comm_allreduce, comm_scan, comm_reduce_T
-*/
-
-#if !defined(FAIL_H) || !defined(TYPES_H)
-#warning "comm.h" requires "fail.h" and "types.h"
-#endif
-
-/*
-  When the preprocessor macro MPI is defined, defines (very) thin wrappers
-  for the handful of used MPI routines. Alternatively, when MPI is not defined,
-  these wrappers become dummy routines suitable for a single process run.
-  No code outside of "comm.h" and "comm.c" makes use of MPI at all.
-
-  Basic usage:
-  
-    struct comm c;
-  
-    comm_init(&c, MPI_COMM_WORLD);  // initializes c using MPI_Comm_dup
-
-    comm_free(&c);
-  
-  Very thin MPI wrappers: (see below for implementation)
-
-    comm_send,_recv,_isend,_irecv,_time,_barrier
-    
-  Additionally, some reduction and scan routines are provided making use
-    of the definitions in "gs_defs.h" (provided this has been included first).
-
-  Example comm_allreduce usage:
-    
-    double v[5], buf[5];
-    comm_allreduce(&c, gs_double,gs_add, v,5,buf);
-      // Computes the vector sum of v across all procs, using
-      // buf as a scratch area. Delegates to MPI_Allreduce if possible.
-    
-  Example comm_scan usage:
-    
-    long in[5], out[2][5], buf[2][5];
-    comm_scan(out, &c,gs_long,gs_add, in,5,buf);
-      // out[0] will be the vector sum of "in" across procs with ids
-           *strictly* less than this one (exclusive behavior),
-         and out[1] will be the vector sum across all procs, as would
-           be computed with comm_allreduce.
-         Note: differs from MPI_Scan which has inclusive behavior
-  
-  Example comm_reduce_double, etc. usage:
-  
-    T out, in[10];
-    out = comm_reduce_T(&c, gs_max, in, 10);
-      // out will equal the largest element of "in",
-         across all processors
-      // T can be "double", "float", "int", "long", "slong", "sint", etc.
-         as defined in "gs_defs.h"
-         
-*/
-
-#ifdef MPI
-#include <mpi.h>
-typedef MPI_Comm comm_ext;
-typedef MPI_Request comm_req;
-#else
-typedef int comm_ext;
-typedef int comm_req;
-typedef int MPI_Fint;
-#endif
-
-#define comm_allreduce  PREFIXED_NAME(comm_allreduce )
-#define comm_iallreduce PREFIXED_NAME(comm_iallreduce)
-#define comm_scan       PREFIXED_NAME(comm_scan      )
-#define comm_dot        PREFIXED_NAME(comm_dot       )
-
-/* global id, np vars strictly for diagnostic messages (fail.c) */
-#ifndef comm_gbl_id
-#define comm_gbl_id PREFIXED_NAME(comm_gbl_id)
-#define comm_gbl_np PREFIXED_NAME(comm_gbl_np)
-extern uint comm_gbl_id, comm_gbl_np;
-#endif
-
-struct comm {
-  uint id, np;
-  comm_ext c;
-};
-
-static void comm_init(struct comm *c, comm_ext ce);
-/* (macro) static void comm_init_check(struct comm *c, MPI_Fint ce, uint np); */
-/* (macro) static void comm_dup(struct comm *d, const struct comm *s); */
-static void comm_free(struct comm *c);
-static double comm_time(void);
-static void comm_barrier(const struct comm *c);
-static void comm_recv(const struct comm *c, void *p, size_t n,
-                      uint src, int tag);
-static void comm_send(const struct comm *c, void *p, size_t n,
-                      uint dst, int tag);
-static void comm_irecv(comm_req *req, const struct comm *c,
-                       void *p, size_t n, uint src, int tag);
-static void comm_isend(comm_req *req, const struct comm *c,
-                       void *p, size_t n, uint dst, int tag);
-static void comm_wait(comm_req *req, int n);
-
-double comm_dot(const struct comm *comm, double *v, double *w, uint n);
-
-#ifdef GS_DEFS_H
-void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op,
-                          void *v, uint vn, void *buf);
-void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op,
-                          void *v, uint vn, void *buf);
-void comm_scan(void *scan, const struct comm *com, gs_dom dom, gs_op op,
-               const void *v, uint vn, void *buffer);
-
-#define DEFINE_REDUCE(T) \
-T PREFIXED_NAME(comm_reduce__##T)( \
-    const struct comm *comm, gs_op op, const T *in, uint n); \
-static T comm_reduce_##T(const struct comm *c, gs_op op, const T *v, uint vn) \
-{ return PREFIXED_NAME(comm_reduce__##T)(c,op,v,vn); }
-GS_FOR_EACH_DOMAIN(DEFINE_REDUCE)
-#undef DEFINE_REDUCE
-
-#define comm_reduce_sint \
-    TYPE_LOCAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long)
-#define comm_reduce_slong \
-   TYPE_GLOBAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long)
-
-#endif
-
-/*----------------------------------------------------------------------------
-  Code for static (inline) functions
-  ----------------------------------------------------------------------------*/
-
-static void comm_init(struct comm *c, comm_ext ce)
-{
-#ifdef MPI
-  int i;
-  MPI_Comm_dup(ce, &c->c);
-  MPI_Comm_rank(c->c,&i), comm_gbl_id=c->id=i;
-  MPI_Comm_size(c->c,&i), comm_gbl_np=c->np=i;
-#else
-  c->id = 0, c->np = 1;
-#endif
-}
-
-static void comm_init_check_(struct comm *c, MPI_Fint ce, uint np,
-                             const char *file, unsigned line)
-{
-#ifdef MPI
-  comm_init(c,MPI_Comm_f2c(ce));
-  if(c->np != np)
-    fail(1,file,line,"comm_init_check: passed P=%u, "
-                     "but MPI_Comm_size gives P=%u",
-                     (unsigned)np,(unsigned)c->np);
-#else
-  comm_init(c,0);
-  if(np != 1)
-    fail(1,file,line,"comm_init_check: passed P=%u, "
-                     "but not compiled with -DMPI",(unsigned)np);
-#endif
-}
-#define comm_init_check(c,ce,np) comm_init_check_(c,ce,np,__FILE__,__LINE__)
-
-
-static void comm_dup_(struct comm *d, const struct comm *s,
-                      const char *file, unsigned line)
-{
-  d->id = s->id, d->np = s->np;
-#ifdef MPI
-  MPI_Comm_dup(s->c,&d->c);
-#else
-  if(s->np!=1) fail(1,file,line,"%s not compiled with -DMPI\n",file);
-#endif
-}
-#define comm_dup(d,s) comm_dup_(d,s,__FILE__,__LINE__)
-
-static void comm_free(struct comm *c)
-{
-#ifdef MPI
-  MPI_Comm_free(&c->c);
-#endif
-}
-
-static double comm_time(void)
-{
-#ifdef MPI
-  return MPI_Wtime();
-#else
-  return 0;
-#endif
-}
-
-static void comm_barrier(const struct comm *c)
-{
-#ifdef MPI
-  MPI_Barrier(c->c);
-#endif
-}
-
-static void comm_recv(const struct comm *c, void *p, size_t n,
-                      uint src, int tag)
-{
-#ifdef MPI
-# ifndef MPI_STATUS_IGNORE
-  MPI_Status stat;
-  MPI_Recv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,&stat);
-# else  
-  MPI_Recv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,MPI_STATUS_IGNORE);
-# endif
-#endif
-}
-
-static void comm_send(const struct comm *c, void *p, size_t n,
-                      uint dst, int tag)
-{
-#ifdef MPI
-  MPI_Send(p,n,MPI_UNSIGNED_CHAR,dst,tag,c->c);
-#endif
-}
-
-static void comm_irecv(comm_req *req, const struct comm *c,
-                       void *p, size_t n, uint src, int tag)
-{
-#ifdef MPI
-  MPI_Irecv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,req);
-#endif
-}
-
-static void comm_isend(comm_req *req, const struct comm *c,
-                       void *p, size_t n, uint dst, int tag)
-{
-#ifdef MPI
-  MPI_Isend(p,n,MPI_UNSIGNED_CHAR,dst,tag,c->c,req);
-#endif
-}
-
-static void comm_wait(comm_req *req, int n)
-{
-#ifdef MPI
-# ifndef MPI_STATUSES_IGNORE
-  MPI_Status status[8];
-  while(n>=8) MPI_Waitall(8,req,status), req+=8, n-=8;
-  if(n>0) MPI_Waitall(n,req,status);
-# else
-  MPI_Waitall(n,req,MPI_STATUSES_IGNORE);
-# endif  
-#endif
-}
-
-static void comm_bcast(const struct comm *c, void *p, size_t n, uint root)
-{
-#ifdef MPI
-  MPI_Bcast(p,n,MPI_UNSIGNED_CHAR,root,c->c);
-#endif
-}
-
-#endif
diff --git a/3rdParty/gslib/src/crs.h b/3rdParty/gslib/src/crs.h
deleted file mode 100644
index eeb60d33c..000000000
--- a/3rdParty/gslib/src/crs.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef CRS_H
-#define CRS_H
-
-#if !defined(COMM_H)
-#warning "crs.h" requires "comm.h"
-#endif
-
-#define crs_xxt_setup PREFIXED_NAME(crs_xxt_setup)
-#define crs_xxt_solve PREFIXED_NAME(crs_xxt_solve)
-#define crs_xxt_stats PREFIXED_NAME(crs_xxt_stats)
-#define crs_xxt_free  PREFIXED_NAME(crs_xxt_free )
-
-#define crs_amg_setup PREFIXED_NAME(crs_amg_setup)
-#define crs_amg_solve PREFIXED_NAME(crs_amg_solve)
-#define crs_amg_stats PREFIXED_NAME(crs_amg_stats)
-#define crs_amg_free  PREFIXED_NAME(crs_amg_free )
-
-struct crs_data;
-
-struct crs_data *crs_xxt_setup(
-  uint n, const ulong *id,
-  uint nz, const uint *Ai, const uint *Aj, const double *A,
-  uint null_space, const struct comm *comm);
-void crs_xxt_solve(double *x, struct crs_data *data, double *b);
-void crs_xxt_stats(struct crs_data *data);
-void crs_xxt_free(struct crs_data *data);
-
-struct crs_data *crs_amg_setup(
-  uint n, const ulong *id,
-  uint nz, const uint *Ai, const uint *Aj, const double *A,
-  uint null_space, const struct comm *comm);
-void crs_amg_solve(double *x, struct crs_data *data, double *b);
-void crs_amg_stats(struct crs_data *data);
-void crs_amg_free(struct crs_data *data);
-
-#endif
diff --git a/3rdParty/gslib/src/crystal.c b/3rdParty/gslib/src/crystal.c
deleted file mode 100644
index a0e813508..000000000
--- a/3rdParty/gslib/src/crystal.c
+++ /dev/null
@@ -1,141 +0,0 @@
-/*------------------------------------------------------------------------------
-  
-  Crystal Router
-  
-  Accomplishes all-to-all communication in log P msgs per proc
-  The routine is low-level; the format of the input/output is an
-  array of integers, consisting of a sequence of messages with format:
-  
-      target proc
-      source proc
-      m
-      integer
-      integer
-      ...
-      integer  (m integers in total)
-
-  Before crystal_router is called, the source of each message should be
-  set to this proc id; upon return from crystal_router, the target of each
-  message will be this proc id.
-  
-  Example Usage:
-  
-    struct crystal cr;
-    
-    crystal_init(&cr, &comm);  // makes an internal copy of comm
-    
-    crystal.data.n = ... ;  // total number of integers (not bytes!)
-    buffer_reserve(&cr.data, crystal.n * sizeof(uint));
-    ... // fill cr.data.ptr with messages
-    crystal_router(&cr);
-    
-    crystal_free(&cr);
-    
-  ----------------------------------------------------------------------------*/
-
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "comm.h"
-#include "mem.h"
-
-#define crystal_init   PREFIXED_NAME(crystal_init  )
-#define crystal_free   PREFIXED_NAME(crystal_free  )
-#define crystal_router PREFIXED_NAME(crystal_router)
-
-struct crystal {
-  struct comm comm;
-  buffer data, work;
-};
-
-void crystal_init(struct crystal *p, const struct comm *comm)
-{
-  comm_dup(&p->comm, comm);
-  buffer_init(&p->data,1000);
-  buffer_init(&p->work,1000);
-}
-
-void crystal_free(struct crystal *p)
-{
-  comm_free(&p->comm);
-  buffer_free(&p->data);
-  buffer_free(&p->work);
-}
-
-static void uintcpy(uint *dst, const uint *src, uint n)
-{
-  if(dst+n<=src)    memcpy (dst,src,n*sizeof(uint));
-  else if(dst!=src) memmove(dst,src,n*sizeof(uint));
-}
-
-static uint crystal_move(struct crystal *p, uint cutoff, int send_hi)
-{
-  uint len, *src, *end;
-  uint *keep = p->data.ptr, *send;
-  uint n = p->data.n;
-  send = buffer_reserve(&p->work,n*sizeof(uint));
-  if(send_hi) { /* send hi, keep lo */
-    for(src=keep,end=keep+n; src<end; src+=len) {
-      len = 3 + src[2];
-      if(src[0]>=cutoff) memcpy (send,src,len*sizeof(uint)), send+=len;
-      else               uintcpy(keep,src,len),              keep+=len;
-    }
-  } else      { /* send lo, keep hi */
-    for(src=keep,end=keep+n; src<end; src+=len) {
-      len = 3 + src[2];
-      if(src[0]< cutoff) memcpy (send,src,len*sizeof(uint)), send+=len;
-      else               uintcpy(keep,src,len),              keep+=len;
-    }
-  }
-  p->data.n = keep - (uint*)p->data.ptr;
-  return      send - (uint*)p->work.ptr;
-}
-
-static void crystal_exchange(struct crystal *p, uint send_n, uint targ,
-                             int recvn, int tag)
-{
-  comm_req req[3];
-  uint count[2] = {0,0}, sum, *recv[2];
-
-  if(recvn)   
-    comm_irecv(&req[1],&p->comm, &count[0],sizeof(uint), targ        ,tag);
-  if(recvn==2)
-    comm_irecv(&req[2],&p->comm, &count[1],sizeof(uint), p->comm.id-1,tag);
-  comm_isend(&req[0],&p->comm, &send_n,sizeof(uint), targ,tag);
-  comm_wait(req,recvn+1);
-  
-  sum = p->data.n + count[0] + count[1];
-  buffer_reserve(&p->data,sum*sizeof(uint));
-  recv[0] = (uint*)p->data.ptr + p->data.n, recv[1] = recv[0] + count[0];
-  p->data.n = sum;
-  
-  if(recvn)    comm_irecv(&req[1],&p->comm,
-                          recv[0],count[0]*sizeof(uint), targ        ,tag+1);
-  if(recvn==2) comm_irecv(&req[2],&p->comm,
-                          recv[1],count[1]*sizeof(uint), p->comm.id-1,tag+1);
-  comm_isend(&req[0],&p->comm, p->work.ptr,send_n*sizeof(uint), targ,tag+1);
-  comm_wait(req,recvn+1);
-}
-
-void crystal_router(struct crystal *p)
-{
-  uint bl=0, bh, nl;
-  uint id = p->comm.id, n=p->comm.np;
-  uint send_n, targ, tag = 0;
-  int send_hi, recvn;
-  while(n>1) {
-    nl = (n+1)/2, bh = bl+nl;
-    send_hi = id<bh;
-    send_n = crystal_move(p,bh,send_hi);
-    recvn = 1, targ = n-1-(id-bl)+bl;
-    if(id==targ) targ=bh, recvn=0;
-    if(n&1 && id==bh) recvn=2;
-    crystal_exchange(p,send_n,targ,recvn,tag);
-    if(id<bh) n=nl; else n-=nl,bl=bh;
-    tag += 2;
-  }
-}
diff --git a/3rdParty/gslib/src/crystal.h b/3rdParty/gslib/src/crystal.h
deleted file mode 100644
index b6d458299..000000000
--- a/3rdParty/gslib/src/crystal.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef CRYSTAL_H
-#define CRYSTAL_H
-
-#if !defined(COMM_H) || !defined(MEM_H)
-#warning "crystal.h" requires "comm.h" and "mem.h"
-#endif
-
-#define crystal_init   PREFIXED_NAME(crystal_init  )
-#define crystal_free   PREFIXED_NAME(crystal_free  )
-#define crystal_router PREFIXED_NAME(crystal_router)
-
-struct crystal {
-  struct comm comm;
-  buffer data, work;
-};
-
-void crystal_init(struct crystal *cr, const struct comm *comm);
-void crystal_free(struct crystal *cr);
-void crystal_router(struct crystal *cr);
-
-#endif
diff --git a/3rdParty/gslib/src/fail.c b/3rdParty/gslib/src/fail.c
deleted file mode 100644
index 4289a2ea0..000000000
--- a/3rdParty/gslib/src/fail.c
+++ /dev/null
@@ -1,63 +0,0 @@
-#include <stdio.h>  /* sprintf, vfprintf, stdout */
-#include <stdarg.h> /* va_list, va_start, ... */
-#include <stdlib.h> /* exit */
-#include <string.h> /* memcpy, and str* functions in comm_fail */
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "comm.h"
-
-#ifdef USE_USR_EXIT
-#define userExitHandler FORTRAN_NAME(userexithandler,USEREXITHANDLER)
-#define USEREXIT 1
-extern void userExitHandler(int status);
-#else
-#define USEREXIT 0
-void userExitHandler(int status) {};
-#endif
-
-void die(int status)
-{
-  if (USEREXIT) {
-  	userExitHandler(status);
-    	while(1);
-  } else {
-    	exit(status); 
-    	while(1);
-  }
-}
-
-void vdiagnostic(const char *prefix, const char *file, unsigned line,
-                 const char *fmt, va_list ap)
-{
-  static char buf[2048]; int n,na,i=0;
-  sprintf(buf,"%s(proc %04d, %s:%d): ",prefix,(int)comm_gbl_id,file,line);
-  vsprintf(buf+strlen(buf),fmt,ap);
-  strcat(buf,"\n");
-  n=strlen(buf);
-  while(n && (na=fwrite(buf+i,1,n,stdout))) n-=na, i+=na;
-  fflush(stdout);
-}
-
-void diagnostic(const char *prefix, const char *file, unsigned line,
-                const char *fmt, ...)
-{
-  va_list ap; va_start(ap,fmt);
-  vdiagnostic(prefix,file,line,fmt,ap);
-  va_end(ap);
-}
-
-void vfail(int status, const char *file, unsigned line,
-           const char *fmt, va_list ap)
-{
-  vdiagnostic("ERROR ",file,line,fmt,ap);
-  die(status);
-}
-
-void fail(int status, const char *file, unsigned line,
-          const char *fmt, ...)
-{
-  va_list ap; va_start(ap,fmt);
-  vfail(status,file,line,fmt,ap);
-  va_end(ap);
-}
diff --git a/3rdParty/gslib/src/fail.h b/3rdParty/gslib/src/fail.h
deleted file mode 100644
index 018511088..000000000
--- a/3rdParty/gslib/src/fail.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#ifndef FAIL_H
-#define FAIL_H
-
-#if !defined(NAME_H)
-#warning "fail.h" requires "name.h"
-#endif
-
-#define  die        PREFIXED_NAME( die       )
-#define vdiagnostic PREFIXED_NAME(vdiagnostic)
-#define  diagnostic PREFIXED_NAME( diagnostic)
-#define vfail       PREFIXED_NAME(vfail      )
-#define  fail       PREFIXED_NAME( fail      )
-
-#ifdef __GNUC__
-#  define ATTRBD   __attribute__ ((noreturn))
-#  define ATTRB4V  __attribute__ ((format(printf,4,0)))
-#  define ATTRB4   __attribute__ ((format(printf,4,5)))
-#  define ATTRB4DV __attribute__ ((noreturn,format(printf,4,0)))
-#  define ATTRB4D  __attribute__ ((noreturn,format(printf,4,5)))
-#else
-#  define ATTRBD
-#  define ATTRB4V
-#  define ATTRB4
-#  define ATTRB4DV
-#  define ATTRB4D
-#endif
-
-#define DEF_FUNS() \
-  void  die(int status) ATTRBD; \
-  void  diagnostic(const char *prefix, const char *file, unsigned line, \
-                   const char *fmt, ...) ATTRB4  ; \
-  void  fail      (int status,         const char *file, unsigned line, \
-                   const char *fmt, ...) ATTRB4D ;
-#define VDEF_FUNS() \
-  void vdiagnostic(const char *prefix, const char *file, unsigned line, \
-                   const char *fmt, va_list ap) ATTRB4V  ; \
-  void vfail      (int status,         const char *file, unsigned line, \
-                   const char *fmt, va_list ap) ATTRB4DV ;
-DEF_FUNS()
-#ifdef va_arg
-VDEF_FUNS()
-#endif
-
-#undef VDEF_FUNS
-#undef DEF_FUNS
-#undef ATTRB4D
-#undef ATTRB4DV
-#undef ATTRB4
-#undef ATTRB4V
-#undef ATTRBD
-
-#endif
diff --git a/3rdParty/gslib/src/fcrystal.c b/3rdParty/gslib/src/fcrystal.c
deleted file mode 100644
index 3fe4c9ae9..000000000
--- a/3rdParty/gslib/src/fcrystal.c
+++ /dev/null
@@ -1,191 +0,0 @@
-#include <stdio.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "mem.h"
-#include "comm.h"
-#include "crystal.h"
-#include "sort.h"
-#include "sarray_sort.h"
-#include "sarray_transfer.h"
-
-/*--------------------------------------------------------------------------
-
-  FORTRAN Interface to crystal router
-   
-  integer h, np
-  MPI_Comm comm
-  call crystal_setup(h,comm,np)  ! set h to handle to new instance
-  ! it is a runtime error if MPI_Comm_size gives a value different than np
-  call crystal_free(h)         ! release instance
-
-  integer*? ituple(m,max)   ! integer type matching sint from "types.h"
-  call crystal_ituple_transfer(h, ituple,m,n,max, kp)
-    - moves each column ituple(:,i), 1 <= i <= n,
-      to proc ituple(kp,i)
-    - sets n to the number of columns received,
-      which may be larger than max (indicating loss of n-max columns)
-    - also sets ituple(kp,i) to the source proc of column ituple(:,i)
-
-  call crystal_ituple_sort(h, ituple,m,n, key,nkey)
-    - locally sorts columns ituple(:,1...n) in ascending order,
-      ranked by ituple(key(1),i),
-           then ituple(key(2),i),
-           ...
-           then ituple(key(nkey),i)
-    - no communication; h used for scratch area
-    - linear time
-    - assumes nonnegative integers
-
-  integer*? vi(mi,max)   ! integer type matching sint  from "types.h"
-  integer*? vl(ml,max)   ! integer type matching slong from "types.h"
-  real      vr(mr,max)
-  call crystal_tuple_transfer(h,n,max, vi,mi,vl,ml,vr,mr, kp)
-    - moves each column vi(:,i),vl(:,i),vr(:,i) 1 <= i <= n,
-      to proc vi(kp,i)
-    - sets n to the number of columns received,
-      which may be larger than max (indicating loss of n-max columns)
-    - also sets vi(kp,i) to the source proc of columns vi(:,i),vl(:,i),vr(:,i)
-
-  call crystal_tuple_sort(h,n, vi,mi,vl,ml,vr,mr, key,nkey)
-    - locally sorts columns vi/vl/vr (:,1...n) in ascending order,
-      ranked by vi(key(1),i) [ or vl(key(1)-mi,i) if key(1)>mi ],
-           then vi(key(2),i) [ or vl(key(2)-mi,i) if key(2)>mi ],
-           ...
-           then vi(key(nkey),i) or vl(key(nkey)-mi,i)
-    - no communication; h used for scratch area
-    - linear time
-    - assumes nonnegative integers
-    - sorting on reals not yet implemented
-
-  --------------------------------------------------------------------------*/
-
-#undef   crystal_free
-#define ccrystal_free  PREFIXED_NAME(crystal_free)
-
-#define fcrystal_setup           \
-  FORTRAN_NAME(crystal_setup          ,CRYSTAL_SETUP          )
-#define fcrystal_ituple_sort     \
-  FORTRAN_NAME(crystal_ituple_sort    ,CRYSTAL_ITUPLE_SORT    )
-#define fcrystal_tuple_sort      \
-  FORTRAN_NAME(crystal_tuple_sort     ,CRYSTAL_TUPLE_SORT     )
-#define fcrystal_ituple_transfer \
-  FORTRAN_NAME(crystal_ituple_transfer,CRYSTAL_ITUPLE_TRANSFER)
-#define fcrystal_tuple_transfer  \
-  FORTRAN_NAME(crystal_tuple_transfer ,CRYSTAL_TUPLE_TRANSFER )
-#define fcrystal_free            \
-  FORTRAN_NAME(crystal_free           ,CRYSTAL_FREE           )
-
-static struct crystal **handle_array = 0;
-static int handle_max = 0;
-static int handle_n = 0;
-
-void fcrystal_setup(sint *handle, const MPI_Fint *comm, const sint *np)
-{
-  struct crystal *p;
-  if(handle_n==handle_max)
-    handle_max+=handle_max/2+1,
-    handle_array=trealloc(struct crystal*,handle_array,handle_max);
-  handle_array[handle_n]=p=tmalloc(struct crystal,1);
-  comm_init_check(&p->comm, *comm, *np);
-  buffer_init(&p->data,1000);
-  buffer_init(&p->work,1000);
-  *handle = handle_n++;
-}
-
-#define CHECK_HANDLE(func) do \
-  if(*handle<0 || *handle>=handle_n || !handle_array[*handle]) \
-    fail(1,__FILE__,__LINE__,func ": invalid handle"); \
-while(0)
-
-void fcrystal_ituple_sort(const sint *handle,
-                          sint A[], const sint *m, const sint *n,
-                          const sint keys[], const sint *nkey)
-{
-  const size_t size = (*m)*sizeof(sint);
-  sint nk = *nkey;
-  buffer *buf;
-  CHECK_HANDLE("crystal_ituple_sort");
-  buf = &handle_array[*handle]->data;
-  if(--nk>=0) {
-    sortp(buf,0, (uint*)&A[keys[nk]-1],*n,size);
-    while(--nk>=0)
-      sortp(buf,1, (uint*)&A[keys[nk]-1],*n,size);
-    sarray_permute_buf_(ALIGNOF(sint),size,A,*n, buf);
-  }
-}
-
-void fcrystal_tuple_sort(const sint *const handle, const sint *const n,
-                         sint   Ai[], const sint *const mi,
-                         slong  Al[], const sint *const ml,
-                         double Ad[], const sint *const md,
-                         const sint keys[], const sint *const nkey)
-{
-  const size_t size_i = (*mi)*sizeof(sint),
-               size_l = (*ml)*sizeof(slong),
-               size_d = (*md)*sizeof(double);
-  int init=0;
-  sint nk = *nkey;
-  buffer *buf;
-  CHECK_HANDLE("crystal_tuple_sort");
-  buf = &handle_array[*handle]->data;
-  if(nk<=0) return;
-  while(--nk>=0) {
-    sint k = keys[nk]-1;
-    if(k<0 || k>=*mi+*ml)
-      fail(1,__FILE__,__LINE__,"crystal_tuple_sort: invalid key");
-    else if(k<*mi) sortp     (buf,init, (uint *)&Ai[k],    *n,size_i);
-    else           sortp_long(buf,init, (ulong*)&Al[k-*mi],*n,size_l);
-    init=1;
-  }
-  if(*mi) sarray_permute_buf_(ALIGNOF(sint  ),size_i,Ai,*n, buf);
-  if(*ml) sarray_permute_buf_(ALIGNOF(slong ),size_l,Al,*n, buf);
-  if(*md) sarray_permute_buf_(ALIGNOF(double),size_d,Ad,*n, buf);
-}
-
-void fcrystal_ituple_transfer(const sint *handle,
-                              sint A[], const sint *m, sint *n,
-                              const sint *nmax, const sint *proc_key)
-{
-  struct array ar, *const ar_ptr = &ar;
-  const unsigned size=(*m)*sizeof(sint);
-  CHECK_HANDLE("crystal_ituple_transfer");
-  ar.ptr=A, ar.n=*n, ar.max=*nmax;
-  *n = sarray_transfer_many(&ar_ptr,&size,1, 1,0,1,(*proc_key-1)*sizeof(sint),
-         (uint*)&A[*proc_key-1],size, handle_array[*handle]);
-}
-
-void fcrystal_tuple_transfer(
-  const sint *const handle, sint *const n, const sint *const max,
-  sint   Ai[], const sint *const mi,
-  slong  Al[], const sint *const ml,
-  double Ad[], const sint *const md,
-  const sint *const proc_key)
-{
-  struct array ar_i, ar_l, ar_d, *ar[3];
-  unsigned size[3];
-  CHECK_HANDLE("crystal_tuple_transfer");
-  size[0]=*mi*sizeof(sint);
-  size[1]=*ml*sizeof(slong);
-  size[2]=*md*sizeof(double);
-  ar[0]=&ar_i, ar[1]=&ar_l, ar[2]=&ar_d;
-  ar_i.ptr=Ai,ar_l.ptr=Al,ar_d.ptr=Ad;
-  ar_i.n=ar_l.n=ar_d.n = *n;
-  ar_i.max=ar_l.max=ar_d.max=*max;
-  *n = sarray_transfer_many(ar,size,3, 1,0,1,(*proc_key-1)*sizeof(sint),
-         (uint*)&Ai[*proc_key-1],size[0], handle_array[*handle]);
-}
-
-void fcrystal_free(sint *handle)
-{
-  CHECK_HANDLE("crystal_free");
-  ccrystal_free(handle_array[*handle]);
-  free(handle_array[*handle]);
-  handle_array[*handle] = 0;
-}
-
-
diff --git a/3rdParty/gslib/src/findpts.c b/3rdParty/gslib/src/findpts.c
deleted file mode 100644
index 1ed472f36..000000000
--- a/3rdParty/gslib/src/findpts.c
+++ /dev/null
@@ -1,369 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <math.h>
-#include "c99.h"
-#include "name.h"
-#include "types.h"
-#include "fail.h"
-#include "mem.h"
-#include "poly.h"
-#include "obbox.h"
-#include "findpts_el.h"
-#include "findpts_local.h"
-#include "gs_defs.h"
-#include "comm.h"
-#include "crystal.h"
-#include "sarray_transfer.h"
-#include "sort.h"
-#include "sarray_sort.h"
-/*
-#define DIAGNOSTICS
-*/
-#ifdef DIAGNOSTICS
-#include <stdio.h>
-#endif
-
-#define CODE_INTERNAL 0
-#define CODE_BORDER 1
-#define CODE_NOT_FOUND 2
-
-struct ulong_range { ulong min, max; };
-struct proc_index { uint proc, index; };
-
-static slong lfloor(double x) { return floor(x); }
-static slong lceil (double x) { return ceil (x); }
-
-static ulong hash_index_aux(double low, double fac, ulong n, double x)
-{
-  const slong i = lfloor((x-low)*fac);
-  return i<0 ? 0 : (n-1<(ulong)i ? n-1 : (ulong)i);
-}
-
-static void set_bit(unsigned char *const p, const uint i)
-{
-  const uint byte = i/CHAR_BIT;
-  const unsigned bit = i%CHAR_BIT;
-  p[byte] |= 1u<<bit;
-}
-
-static unsigned get_bit(const unsigned char *const p, const uint i)
-{
-  const uint byte = i/CHAR_BIT;
-  const unsigned bit = i%CHAR_BIT;
-  return p[byte]>>bit & 1u;
-}
-
-static unsigned byte_bits(const unsigned char x)
-{
-  unsigned bit, sum=0;
-  for(bit=0;bit<CHAR_BIT;++bit) sum += x>>bit & 1u;
-  return sum;
-}
-
-static uint count_bits(unsigned char *p, uint n)
-{
-  uint sum=0;
-  for(;n;--n) sum+=byte_bits(*p++);
-  return sum;
-}
-
-#define D 2
-#define WHEN_3D(a)
-#include "findpts_imp.h"
-#undef WHEN_3D
-#undef D
-
-#define D 3
-#define WHEN_3D(a) a
-#include "findpts_imp.h"
-#undef WHEN_3D
-#undef D
-
-/*--------------------------------------------------------------------------
-
-  FORTRAN Interface
-
-  --------------------------------------------------------------------------
-  call findpts_setup(h, comm,np, ndim, xm,ym,zm, nr,ns,nt,nel,
-                     mr,ms,mt, bbox_tol, loc_hash_size, gbl_hash_size,
-                     npt_max, newt_tol)
-
-    (zm,nt,mt all ignored when ndim==2)
-                     
-    h: (output) handle
-    comm,np: MPI communicator and # of procs (checked against MPI_Comm_size)
-    ndim: 2 or 3
-    xm,ym,zm: element geometry (nodal x,y,z values)
-    nr,ns,nt,nel: element dimensions --- e.g., xm(nr,ns,nt,nel)
-  
-    mr,ms,mt: finer mesh size for bounding box computation;
-              must be larger than nr,ns,nt for correctness,
-              recommend at least 2*nr,2*ns,2*nt
-    bbox_tol: e.g., 0.01 - relative size to expand bounding boxes by;
-              prevents points from falling through "cracks",
-              and prevents "not found" failures for points just outside mesh
-                (returning instead the closest point inside the mesh)
-
-    loc_hash_size: e.g., nr*ns*nt*nel
-                   maximum number of integers to use for local geom hash table;
-                   minimum is nel+2 for the trivial table with one cell
-                 
-    gbl_hash_size: e.g., nr*ns*nt*nel
-                   approx number of cells per proc for the distributed
-                     global geometric hash table
-                   NOTE: gbl_hash_size*np needs to fit in a "global" integer
-                         (controlled by -DGLOBAL_LONG or -DGLOBAL_LONG_LONG;
-                          see "types.h")
-                   actual number of cells per proc will be greater by
-                     ~ 3 gbl_hash_size^(2/3) / np^(1/3)
-  
-    npt_max: e.g., 256
-             number of points to iterate on simultaneously
-             enables dominant complexity to be matrix-matrix products
-               (there is a sweet spot --- too high and the cache runs out)
-             the memory allocation term dependent on npt_max is
-               (12 + 2*(nr+ns+nt+nr*ns)) * npt_max     doubles
-  
-    newt_tol: e.g., 1024*DBL_EPSILON
-              the iteration stops for a point when
-                   the 1-norm of the step in (r,s,t) is smaller than newt_tol
-                or the objective (dist^2) increases while the predicted (model)
-                  decrease is smaller than newt_tol * (the objective)
-
-  --------------------------------------------------------------------------
-  call findpts_free(h)
-  
-  --------------------------------------------------------------------------
-  call findpts(h, code_base,  code_stride,
-                  proc_base,  proc_stride,
-                    el_base,    el_stride,
-                     r_base,     r_stride,
-                 dist2_base, dist2_stride,
-                     x_base,     x_stride,
-                     y_base,     y_stride,
-                     z_base,     z_stride, npt)
-
-    (z_base, z_stride ignored when ndim==2)
-
-    conceptually, locates npt points;
-      data for each point is:
-        ouput:
-          code: 0 - inside an element
-                1 - closest point on a border
-                    (perhaps exactly, or maybe just near --- check dist2)
-                2 - not found (bbox_tol controls cut-off between code 1 and 2)
-          proc:    remote processor on which the point was found
-          el:      element on remote processor in which the point was found
-          r(ndim): parametric coordinates for point
-          dist2: distance squared from found to sought point (in xyz space)
-        input:
-          x, y, z: coordinates of sought point
-    
-    the *_base arguments point to the data for the first point,
-      each is advanced by the corresponding *_stride argument for the next point
-    this allows fairly arbitrary data layout,
-      but note the r,s,t coordinates for each point must be packed together
-      (consequently, r_stride must be at least ndim)
-
-
-  --------------------------------------------------------------------------
-  call findpts_eval(h,  out_base,  out_stride,
-                       code_base, code_stride,
-                       proc_base, proc_stride,
-                         el_base,   el_stride,
-                          r_base,    r_stride, npt,
-                    input_field)
-  
-    may be called immediately after findpts (or any other time)
-    to evaluate input_field at the given points ---
-      these specified by code,proc,el,r(ndim) and possibly remote
-    --- storing the interpolated values in out
-          [that is, at out_base(1+out_stride*(point_index-1)) ]
-    
-    for example, following a call to findpts, a call to findpts_eval with
-      input_field = xm, will ideally result in out = x(1) for each point,
-      or x(2) for ym, x(3) for zm
-
-
-  --------------------------------------------------------------------------
-  call findpts_eval_local(h,
-                        out_base,  out_stride,
-                         el_base,   el_stride,
-                          r_base,    r_stride, npt,
-                    input_field)
-
-    just like findpts_eval, but does assumes all points are local,
-    and does no communication. will use matrix-matrix products if
-    points are grouped by element.
-
-  --------------------------------------------------------------------------*/
-
-#define ffindpts_setup      FORTRAN_NAME(findpts_setup     ,FINDPTS_SETUP     )
-#define ffindpts_free       FORTRAN_NAME(findpts_free      ,FINDPTS_FREE      )
-#define ffindpts            FORTRAN_NAME(findpts           ,FINDPTS           )
-#define ffindpts_eval       FORTRAN_NAME(findpts_eval      ,FINDPTS_EVAL      )
-#define ffindpts_eval_local FORTRAN_NAME(findpts_eval_local,FINDPTS_EVAL_LOCAL)
-
-struct handle { void *data; unsigned ndim; };
-static struct handle *handle_array = 0;
-static int handle_max = 0;
-static int handle_n = 0;
-
-void ffindpts_setup(sint *const handle,
-  const MPI_Fint *const comm, const sint *const np,
-  const sint *ndim,
-  const double *const xm, const double *const ym, const double *const zm,
-  const sint *const nr, const sint *const ns, const sint *const nt,
-  const sint *const nel,
-  const sint *const mr, const sint *const ms, const sint *const mt,
-  const double *const bbox_tol,
-  const sint *const loc_hash_size, const sint *const gbl_hash_size,
-  const sint *const npt_max,
-  const double *const newt_tol)
-{
-  struct handle *h;
-  if(handle_n==handle_max)
-    handle_max+=handle_max/2+1,
-    handle_array=trealloc(struct handle,handle_array,handle_max);
-  h = &handle_array[handle_n];
-  h->ndim = *ndim;
-  if(h->ndim==2) {
-    struct findpts_data_2 *const fd = tmalloc(struct findpts_data_2,1);
-    const double *elx[2];
-    uint n[2], m[2];
-    elx[0]=xm,elx[1]=ym;
-    n[0]=*nr,n[1]=*ns;
-    m[0]=*mr,m[1]=*ms;
-    h->data = fd;
-    comm_init_check(&fd->cr.comm, *comm, *np);
-    buffer_init(&fd->cr.data,1000);
-    buffer_init(&fd->cr.work,1000);
-    setup_aux_2(fd, elx,n,*nel,m,*bbox_tol,
-                *loc_hash_size,*gbl_hash_size, *npt_max, *newt_tol);
-  } else if(h->ndim==3) {
-    struct findpts_data_3 *const fd = tmalloc(struct findpts_data_3,1);
-    const double *elx[3];
-    uint n[3], m[3];
-    elx[0]=xm,elx[1]=ym,elx[2]=zm;
-    n[0]=*nr,n[1]=*ns,n[2]=*nt;
-    m[0]=*mr,m[1]=*ms,m[2]=*mt;
-    h->data = fd;
-    comm_init_check(&fd->cr.comm, *comm, *np);
-    buffer_init(&fd->cr.data,1000);
-    buffer_init(&fd->cr.work,1000);
-    setup_aux_3(fd, elx,n,*nel,m,*bbox_tol,
-                *loc_hash_size,*gbl_hash_size, *npt_max, *newt_tol);
-  } else
-    fail(1,__FILE__,__LINE__,
-         "findpts_setup: ndim must be 2 or 3; given ndim=%u",(unsigned)h->ndim);
-  *handle = handle_n++;
-}
-
-#define CHECK_HANDLE(func) \
-  struct handle *h; \
-  if(*handle<0 || *handle>=handle_n || !(h=&handle_array[*handle])->data) \
-    fail(1,__FILE__,__LINE__,func ": invalid handle")
-
-void ffindpts_free(const sint *const handle)
-{
-  CHECK_HANDLE("findpts_free");
-  if(h->ndim==2)
-    PREFIXED_NAME(findpts_free_2)(h->data);
-  else
-    PREFIXED_NAME(findpts_free_3)(h->data);
-  h->data = 0;
-}
-
-void ffindpts(const sint *const handle,
-          sint *const  code_base, const sint *const  code_stride,
-          sint *const  proc_base, const sint *const  proc_stride,
-          sint *const    el_base, const sint *const    el_stride,
-        double *const     r_base, const sint *const     r_stride,
-        double *const dist2_base, const sint *const dist2_stride,
-  const double *const     x_base, const sint *const     x_stride,
-  const double *const     y_base, const sint *const     y_stride,
-  const double *const     z_base, const sint *const     z_stride,
-  const sint *const npt)
-{
-  CHECK_HANDLE("findpts");
-  if(h->ndim==2) {
-    const double *xv_base[2];
-    unsigned xv_stride[2];
-    xv_base[0]=x_base, xv_base[1]=y_base;
-    xv_stride[0] = *x_stride*sizeof(double),
-    xv_stride[1] = *y_stride*sizeof(double);
-    PREFIXED_NAME(findpts_2)(
-      (uint*)code_base,(* code_stride)*sizeof(sint  ),
-      (uint*)proc_base,(* proc_stride)*sizeof(sint  ),
-      (uint*)  el_base,(*   el_stride)*sizeof(sint  ),
-                r_base,(*    r_stride)*sizeof(double),
-            dist2_base,(*dist2_stride)*sizeof(double),
-               xv_base,     xv_stride,
-      *npt, h->data);
-  } else {
-    const double *xv_base[3];
-    unsigned xv_stride[3];
-    xv_base[0]=x_base, xv_base[1]=y_base, xv_base[2]=z_base;
-    xv_stride[0] = *x_stride*sizeof(double),
-    xv_stride[1] = *y_stride*sizeof(double),
-    xv_stride[2] = *z_stride*sizeof(double);
-    PREFIXED_NAME(findpts_3)(
-      (uint*)code_base,(* code_stride)*sizeof(sint  ),
-      (uint*)proc_base,(* proc_stride)*sizeof(sint  ),
-      (uint*)  el_base,(*   el_stride)*sizeof(sint  ),
-                r_base,(*    r_stride)*sizeof(double),
-            dist2_base,(*dist2_stride)*sizeof(double),
-               xv_base,     xv_stride,
-      *npt, h->data);
-  }
-}
-
-void ffindpts_eval(const sint *const handle,
-        double *const  out_base, const sint *const  out_stride,
-  const   sint *const code_base, const sint *const code_stride,
-  const   sint *const proc_base, const sint *const proc_stride,
-  const   sint *const   el_base, const sint *const   el_stride,
-  const double *const    r_base, const sint *const    r_stride,
-  const sint *const npt, const double *const in)
-{
-  CHECK_HANDLE("findpts_eval");
-  if(h->ndim==2)
-    PREFIXED_NAME(findpts_eval_2)(
-              out_base,(* out_stride)*sizeof(double),
-      (uint*)code_base,(*code_stride)*sizeof(sint  ),
-      (uint*)proc_base,(*proc_stride)*sizeof(sint  ),
-      (uint*)  el_base,(*  el_stride)*sizeof(sint  ),
-                r_base,(*   r_stride)*sizeof(double),
-      *npt, in, h->data);
-  else
-    PREFIXED_NAME(findpts_eval_3)(
-              out_base,(* out_stride)*sizeof(double),
-      (uint*)code_base,(*code_stride)*sizeof(sint  ),
-      (uint*)proc_base,(*proc_stride)*sizeof(sint  ),
-      (uint*)  el_base,(*  el_stride)*sizeof(sint  ),
-                r_base,(*   r_stride)*sizeof(double),
-      *npt, in, h->data);
-}
-
-void ffindpts_eval_local(const sint *const handle,
-        double *const  out_base, const sint *const  out_stride,
-  const   sint *const   el_base, const sint *const   el_stride,
-  const double *const    r_base, const sint *const    r_stride,
-  const sint *const npt, const double *const in)
-{
-  CHECK_HANDLE("findpts_eval_local");
-  if(h->ndim==2)
-    findpts_local_eval_2(
-              out_base,(* out_stride)*sizeof(double),
-      (uint*)  el_base,(*  el_stride)*sizeof(sint  ),
-                r_base,(*   r_stride)*sizeof(double),
-      *npt, in, &((struct findpts_data_2 *)h->data)->local);
-  else
-    findpts_local_eval_3(
-              out_base,(* out_stride)*sizeof(double),
-      (uint*)  el_base,(*  el_stride)*sizeof(sint  ),
-                r_base,(*   r_stride)*sizeof(double),
-      *npt, in, &((struct findpts_data_3 *)h->data)->local);
-}
diff --git a/3rdParty/gslib/src/findpts.h b/3rdParty/gslib/src/findpts.h
deleted file mode 100644
index 16846917c..000000000
--- a/3rdParty/gslib/src/findpts.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef FINDPTS_H
-#define FINDPTS_H
-
-#if !defined(COMM_H)
-#warning "findpts.h" requires "comm.h"
-#endif
-
-#define findpts_setup_2   PREFIXED_NAME(findpts_setup_2)
-#define findpts_free_2    PREFIXED_NAME(findpts_free_2 )
-#define findpts_2         PREFIXED_NAME(findpts_2      )
-#define findpts_eval_2    PREFIXED_NAME(findpts_eval_2 )
-#define findpts_setup_3   PREFIXED_NAME(findpts_setup_3)
-#define findpts_free_3    PREFIXED_NAME(findpts_free_3 )
-#define findpts_3         PREFIXED_NAME(findpts_3      )
-#define findpts_eval_3    PREFIXED_NAME(findpts_eval_3 )
-
-struct findpts_data_2;
-struct findpts_data_3;
-
-struct findpts_data_2 *findpts_setup_2(
-  const struct comm *const comm,
-  const double *const elx[2],
-  const unsigned n[2], const uint nel,
-  const unsigned m[2], const double bbox_tol,
-  const uint local_hash_size, const uint global_hash_size,
-  const unsigned npt_max, const double newt_tol);
-
-struct findpts_data_3 *findpts_setup_3(
-  const struct comm *const comm,
-  const double *const elx[3],
-  const unsigned n[3], const uint nel,
-  const unsigned m[3], const double bbox_tol,
-  const uint local_hash_size, const uint global_hash_size,
-  const unsigned npt_max, const double newt_tol);
-
-void findpts_free_2(struct findpts_data_2 *fd);
-void findpts_free_3(struct findpts_data_3 *fd);
-
-void findpts_2(    uint   *const  code_base   , const unsigned  code_stride   ,
-                   uint   *const  proc_base   , const unsigned  proc_stride   ,
-                   uint   *const    el_base   , const unsigned    el_stride   ,
-                   double *const     r_base   , const unsigned     r_stride   ,
-                   double *const dist2_base   , const unsigned dist2_stride   ,
-             const double *const     x_base[2], const unsigned     x_stride[2],
-             const uint npt, struct findpts_data_2 *const fd);
-
-void findpts_3(    uint   *const  code_base   , const unsigned  code_stride   ,
-                   uint   *const  proc_base   , const unsigned  proc_stride   ,
-                   uint   *const    el_base   , const unsigned    el_stride   ,
-                   double *const     r_base   , const unsigned     r_stride   ,
-                   double *const dist2_base   , const unsigned dist2_stride   ,
-             const double *const     x_base[3], const unsigned     x_stride[3],
-             const uint npt, struct findpts_data_3 *const fd);
-
-void findpts_eval_2(
-        double *const  out_base, const unsigned  out_stride,
-  const uint   *const code_base, const unsigned code_stride,
-  const uint   *const proc_base, const unsigned proc_stride,
-  const uint   *const   el_base, const unsigned   el_stride,
-  const double *const    r_base, const unsigned    r_stride,
-  const uint npt,
-  const double *const in, struct findpts_data_2 *const fd);
- 
-void findpts_eval_3(
-        double *const  out_base, const unsigned  out_stride,
-  const uint   *const code_base, const unsigned code_stride,
-  const uint   *const proc_base, const unsigned proc_stride,
-  const uint   *const   el_base, const unsigned   el_stride,
-  const double *const    r_base, const unsigned    r_stride,
-  const uint npt,
-  const double *const in, struct findpts_data_3 *const fd);
-
-#endif
diff --git a/3rdParty/gslib/src/findpts_el.h b/3rdParty/gslib/src/findpts_el.h
deleted file mode 100644
index 4ed119aae..000000000
--- a/3rdParty/gslib/src/findpts_el.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#ifndef FINDPTS_EL_H
-#define FINDPTS_EL_H
-
-#if !defined(NAME_H) || !defined(POLY_H)
-#warning "findpts_el.h" requires "name.h", "poly.h"
-#endif
-
-#define findpts_el_setup_2   PREFIXED_NAME(findpts_el_setup_2)
-#define findpts_el_free_2    PREFIXED_NAME(findpts_el_free_2 )
-#define findpts_el_2         PREFIXED_NAME(findpts_el_2      )
-#define findpts_el_eval_2    PREFIXED_NAME(findpts_el_eval_2 )
-
-struct findpts_el_pt_2 {
-  double x[2],r[2],oldr[2],dist2,dist2p,tr;
-  unsigned index,flags;
-};
-
-struct findpts_el_gedge_2 { const double *x[2], *dxdn[2]; };
-struct findpts_el_gpt_2   { double x[2], jac[4], hes[4]; };
-
-struct findpts_el_data_2 {
-  unsigned npt_max;
-  struct findpts_el_pt_2 *p;
-
-  unsigned n[2];
-  double *z[2];
-  lagrange_fun *lag[2];
-  double *lag_data[2];
-  double *wtend[2];
-  
-  const double *x[2];
-  
-  unsigned side_init;
-  double *sides;
-  struct findpts_el_gedge_2 edge[4]; /* R S=-1; R S=1; ... */
-  struct findpts_el_gpt_2 pt[4];
-
-  double *work;
-};
-
-void findpts_el_setup_2(struct findpts_el_data_2 *const fd,
-                        const unsigned n[2],
-                        const unsigned npt_max);
-void findpts_el_free_2(struct findpts_el_data_2 *const fd);
-void findpts_el_2(struct findpts_el_data_2 *fd, unsigned npt, const double tol);
-void findpts_el_eval_2(
-        double *const out_base, const unsigned out_stride,
-  const double *const   r_base, const unsigned   r_stride, const unsigned pn,
-  const double *const in, struct findpts_el_data_2 *const fd);
-
-static void findpts_el_start_2(struct findpts_el_data_2 *const fd,
-                               const double *const x[2])
-{
-  fd->side_init=0,fd->x[0]=x[0],fd->x[1]=x[1];
-}
-
-static struct findpts_el_pt_2 *findpts_el_points_2(
-  struct findpts_el_data_2 *const fd)
-{
-  return fd->p;
-}
-
-#define findpts_el_setup_3   PREFIXED_NAME(findpts_el_setup_3)
-#define findpts_el_free_3    PREFIXED_NAME(findpts_el_free_3 )
-#define findpts_el_3         PREFIXED_NAME(findpts_el_3      )
-#define findpts_el_eval_3    PREFIXED_NAME(findpts_el_eval_3 )
-
-struct findpts_el_pt_3 {
-  double x[3],r[3],oldr[3],dist2,dist2p,tr;
-  unsigned index,flags;
-};
-
-struct findpts_el_gface_3 { const double *x[3], *dxdn[3]; };
-struct findpts_el_gedge_3 { const double *x[3], *dxdn1[3], *dxdn2[3],
-                                         *d2xdn1[3], *d2xdn2[3]; };
-struct findpts_el_gpt_3   { double x[3], jac[9], hes[18]; };
-
-struct findpts_el_data_3 {
-  unsigned npt_max;
-  struct findpts_el_pt_3 *p;
-
-  unsigned n[3];
-  double *z[3];
-  lagrange_fun *lag[3];
-  double *lag_data[3];
-  double *wtend[3];
-  
-  const double *x[3];
-  
-  unsigned side_init;
-  double *sides;
-  struct findpts_el_gface_3 face[6]; /* ST R=-1,R=+1; TR S=-1,S=+1; ... */
-  struct findpts_el_gedge_3 edge[12]; /* R S=-1,T=-1; R S=1,T=-1; ... */
-  struct findpts_el_gpt_3 pt[8];
-
-  double *work;
-};
-
-void findpts_el_setup_3(struct findpts_el_data_3 *const fd,
-                        const unsigned n[3],
-                        const unsigned npt_max);
-void findpts_el_free_3(struct findpts_el_data_3 *const fd);
-void findpts_el_3(struct findpts_el_data_3 *const fd, const unsigned npt,
-                  const double tol);
-void findpts_el_eval_3(
-        double *const out_base, const unsigned out_stride,
-  const double *const   r_base, const unsigned   r_stride, const unsigned pn,
-  const double *const in, struct findpts_el_data_3 *const fd);
-
-static void findpts_el_start_3(struct findpts_el_data_3 *const fd,
-                               const double *const x[3])
-{
-  fd->side_init=0,fd->x[0]=x[0],fd->x[1]=x[1],fd->x[2]=x[2];
-}
-
-static struct findpts_el_pt_3 *findpts_el_points_3(
-  struct findpts_el_data_3 *const fd)
-{
-  return fd->p;
-}
-
-#endif
diff --git a/3rdParty/gslib/src/findpts_el_2.c b/3rdParty/gslib/src/findpts_el_2.c
deleted file mode 100644
index 9d6eca601..000000000
--- a/3rdParty/gslib/src/findpts_el_2.c
+++ /dev/null
@@ -1,819 +0,0 @@
-#include <stdio.h>
-
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <float.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "mem.h"
-#include "tensor.h"
-#include "poly.h"
-
-#define findpts_el_setup_2   PREFIXED_NAME(findpts_el_setup_2)
-#define findpts_el_free_2    PREFIXED_NAME(findpts_el_free_2 )
-#define findpts_el_2         PREFIXED_NAME(findpts_el_2      )
-#define findpts_el_eval_2    PREFIXED_NAME(findpts_el_eval_2 )
-/*
-#define DIAGNOSTICS_1
-#define DIAGNOSTICS_2
-*/
-#define DIAGNOSTICS_ITERATIONS 0
-
-#if defined(DIAGNOSTICS_1) || defined(DIAGNOSTICS_2) \
-    || DIAGNOSTICS_ITERATIONS > 0
-#include <stdio.h>
-#endif
-
-/* A is row-major */
-static void lin_solve_2(double x[2], const double A[4], const double y[2])
-{
-  const double idet = 1/(A[0]*A[3] - A[1]*A[2]);
-  x[0] = idet*(A[3]*y[0] - A[1]*y[1]);
-  x[1] = idet*(A[0]*y[1] - A[2]*y[0]);
-}
-
-struct findpts_el_pt_2 {
-  double x[2],r[2],oldr[2],dist2,dist2p,tr;
-  unsigned index,flags;
-};
-
-/* the bit structure of flags is CSSRR
-   the C bit --- 1<<4 --- is set when the point is converged
-   RR is 0 = 00b if r is unconstrained,
-         1 = 01b if r is constrained at -1
-         2 = 10b if r is constrained at +1
-   SS is similarly for s constraints
-*/
-
-#define CONVERGED_FLAG (1u<<4)
-#define FLAG_MASK 0x1fu
-
-static unsigned num_constrained(const unsigned flags)
-{
-  const unsigned y = flags | flags>>1;
-  return (y&1u) + (y>>2 & 1u);
-}
-
-static unsigned pt_flags_to_bin_noC(const unsigned flags)
-{
-  return (flags>>2 & 3u)*3 + (flags & 3u);
-}
-
-/* map flags to 9 if the C bit is set,
-   else to [0,8] --- the 9 valid configs of SSRR */
-static unsigned pt_flags_to_bin(const unsigned flags)
-{
-  const unsigned mask = 0u - (flags>>4); /* 0 or 0xfff... when converged */
-  return (mask & 9u) | (~mask & pt_flags_to_bin_noC(flags));
-}
-
-/* assumes x = 0, or 1  */
-static unsigned plus_1_mod_2(const unsigned x) { return x^1u; }
-
-/* assumes x = 1 << i, with i < 4, returns i+1 */
-static unsigned which_bit(const unsigned x)
-{
-  const unsigned y = x&7u;
-  return (y-(y>>2)) | ((x-1)&4u);
-}
-
-static unsigned edge_index(const unsigned x) { return which_bit(x)-1; }
-
-static unsigned point_index(const unsigned x)
-{
-  return ((x>>1)&1u) | ((x>>2)&2u);
-}
-
-/* extra data
-
-  we need x, dx/dn for each edge
-    r: x at 0, nrs - nr,
-      4*nr extra for dx/dn
-    s: 8*ns extra
-
-*/
-
-struct findpts_el_gedge_2 { const double *x[2], *dxdn[2]; };
-struct findpts_el_gpt_2   { double x[2], jac[4], hes[4]; };
-
-struct findpts_el_data_2 {
-  unsigned npt_max;
-  struct findpts_el_pt_2 *p;
-
-  unsigned n[2];
-  double *z[2];
-  lagrange_fun *lag[2];
-  double *lag_data[2];
-  double *wtend[2];
-
-  const double *x[2];
-
-  unsigned side_init;
-  double *sides;
-  struct findpts_el_gedge_2 edge[4]; /* R=-1 S; R=1 S; ... */
-  struct findpts_el_gpt_2 pt[4];
-
-  double *work;
-};
-
-/* work[2*(nr+ns)] */
-/* work[4*(nr+ns)] */
-/* work[6*(nr+6)] */
-/* work[(6+2*(2*nr+ns))*pn] */
-/* work[(10+3*n)*pn] */
-static unsigned work_size(
-  const unsigned nr, const unsigned ns, const unsigned npt_max)
-{
-  const unsigned n = ns>nr?ns:nr;
-  unsigned wsize;
-  #define DO_MAX(x) do { const unsigned temp=(x); \
-                         wsize=temp>wsize?temp:wsize; } while(0)
-  wsize = (6 + 2*(2*nr+ns)) * npt_max;
-  DO_MAX(4*(nr+ns));
-  DO_MAX(6*(nr+6));
-  DO_MAX(npt_max*(10+3*n));
-  #undef DO_MAX
-  return wsize;
-}
-
-void findpts_el_setup_2(struct findpts_el_data_2 *const fd,
-                        const unsigned n[2],
-                        const unsigned npt_max)
-{
-  const unsigned nr=n[0], ns=n[1];
-  const unsigned tot = 8*ns + 4*nr;
-  unsigned d,i, lag_size[2];
-
-  fd->npt_max = npt_max;
-  fd->p = tmalloc(struct findpts_el_pt_2, npt_max*2);
-
-  fd->n[0]=nr, fd->n[1]=ns;
-  for(d=0;d<2;++d) lag_size[d] = gll_lag_size(fd->n[d]);
-
-  fd->z[0]        = tmalloc(double,lag_size[0]+lag_size[1]
-                                   +7*(nr+ns) + tot +
-                                   work_size(nr,ns,npt_max));
-  fd->z[1]        = fd->z[0]+nr;
-  fd->lag_data[0] = fd->z[1]+ns;
-  fd->lag_data[1] = fd->lag_data[0]+lag_size[0];
-  fd->wtend[0]    = fd->lag_data[1]+lag_size[1];
-  fd->wtend[1]    = fd->wtend[0]+6*nr;
-  fd->sides       = fd->wtend[1]+6*ns;
-  fd->work        = fd->sides + tot;
-
-  fd->side_init = 0;
-
-  for(d=0;d<2;++d) {
-    double *wt=fd->wtend[d]; unsigned nn=fd->n[d];
-    lobatto_nodes(fd->z[d],nn);
-    fd->lag[d] = gll_lag_setup(fd->lag_data[d],nn);
-    fd->lag[d](wt    , fd->lag_data[d],nn,2,-1);
-    fd->lag[d](wt+3*nn, fd->lag_data[d],nn,2, 1);
-
-    wt[0]=1; for(i=1;i<nn;++i) wt[i]=0;
-    wt+=3*nn; { for(i=0;i<nn-1;++i) wt[i]=0; } wt[i]=1;
-  }
-
-  for(d=0;d<2;++d)
-    fd->edge[0].x[d]    = fd->sides +    d *ns, \
-    fd->edge[0].dxdn[d] = fd->sides + (2+d)*ns, \
-    fd->edge[1].x[d]    = fd->sides + (4+d)*ns, \
-    fd->edge[1].dxdn[d] = fd->sides + (6+d)*ns; \
-
-  for(d=0;d<2;++d)
-    fd->edge[2].x[d] = 0, /* will point to user data */
-    fd->edge[2].dxdn[d] = fd->sides + 8*ns +    d *nr,
-    fd->edge[3].x[d] = 0, /* will point to user data */
-    fd->edge[3].dxdn[d] = fd->sides + 8*ns + (2+d)*nr;
-}
-
-void findpts_el_free_2(struct findpts_el_data_2 *const fd)
-{
-  free(fd->p);
-  free(fd->z[0]);
-}
-
-typedef void compute_edge_data_fun(struct findpts_el_data_2 *fd);
-
-/* work[2*(nr+ns)] */
-static void compute_edge_data_r(struct findpts_el_data_2 *fd)
-{
-  const unsigned nr = fd->n[0], ns=fd->n[1], nrsm1 = nr*(ns-1);
-  unsigned d;
-  double *work = fd->work, *out = fd->sides + 8*ns;
-  memcpy(work   , fd->wtend[1]+  ns, ns*sizeof(double));
-  memcpy(work+ns, fd->wtend[1]+4*ns, ns*sizeof(double));
-  for(d=0;d<2;++d) {
-    tensor_mxm(work+2*ns,nr, fd->x[d],ns, work,2);
-    memcpy(out+   d *nr, work+2*ns      , nr*sizeof(double));
-    memcpy(out+(2+d)*nr, work+2*ns+nr   , nr*sizeof(double));
-    fd->edge[2].x[d] = fd->x[d];
-    fd->edge[3].x[d] = fd->x[d] + nrsm1;
-  }
-}
-
-/* work[4*(nr+ns)] */
-static void compute_edge_data_s(struct findpts_el_data_2 *fd)
-{
-  const unsigned nr = fd->n[0], ns=fd->n[1];
-  unsigned d;
-  double *work = fd->work, *out = fd->sides;
-  memcpy(work     , fd->wtend[0]     , 2*nr*sizeof(double));
-  memcpy(work+2*nr, fd->wtend[0]+3*nr, 2*nr*sizeof(double));
-  for(d=0;d<2;++d) {
-    tensor_mtxm(work+4*nr,ns, fd->x[d],nr, work,4);
-    memcpy(out+   d *ns, work+4*nr     , ns*sizeof(double));
-    memcpy(out+(2+d)*ns, work+4*nr+  ns, ns*sizeof(double));
-    memcpy(out+(4+d)*ns, work+4*nr+2*ns, ns*sizeof(double));
-    memcpy(out+(6+d)*ns, work+4*nr+3*ns, ns*sizeof(double));
-  }
-}
-
-static const struct findpts_el_gedge_2 *get_edge(
-  struct findpts_el_data_2 *fd, unsigned ei)
-{
-  const unsigned mask = 1u<<(ei/2);
-  if((fd->side_init&mask)==0) {
-    compute_edge_data_fun *const fun[2] = {
-      compute_edge_data_s,
-      compute_edge_data_r
-    };
-    fun[ei/2](fd);
-    fd->side_init |= mask;
-  }
-  return &fd->edge[ei];
-}
-
-/* work[6*(nr+6)] */
-static void compute_pt_data(struct findpts_el_data_2 *fd)
-{
-  const unsigned nr = fd->n[0], ns = fd->n[1];
-  double *work = fd->work, *work2 = work+6*nr;
-  unsigned d,i,j;
-  for(d=0;d<2;++d) {
-    tensor_mxm(work,nr, fd->x[d],ns, fd->wtend[1],6);
-    tensor_mtxm(work2,6, fd->wtend[0],nr, work,6);
-    for(j=0;j<2;++j) for(i=0;i<2;++i) {
-      fd->pt[2*j+i].x[d]       = work2[6*(3*j+0)+(3*i+0)];
-      fd->pt[2*j+i].jac[2*d+0] = work2[6*(3*j+0)+(3*i+1)];
-      fd->pt[2*j+i].jac[2*d+1] = work2[6*(3*j+1)+(3*i+0)];
-      fd->pt[2*j+i].hes[2*d+0] = work2[6*(3*j+0)+(3*i+2)];
-      fd->pt[2*j+i].hes[2*d+1] = work2[6*(3*j+2)+(3*i+0)];
-    }
-  }
-}
-
-static const struct findpts_el_gpt_2 *get_pt(
-  struct findpts_el_data_2 *fd, unsigned pi)
-{
-  if((fd->side_init&4u)==0)
-    compute_pt_data(fd), fd->side_init |= 4u;
-  return &fd->pt[pi];
-}
-
-/* check reduction in objective against prediction, and adjust
-   trust region radius (p->tr) accordingly;
-   may reject the prior step, returning 1; otherwise returns 0
-   sets out->dist2, out->index, out->x, out->oldr in any event,
-   leaving out->r, out->dr, out->flags to be set when returning 0 */
-static int reject_prior_step_q(struct findpts_el_pt_2 *const out,
-                               const double resid[2],
-                               const struct findpts_el_pt_2 *const p,
-                               const double tol)
-{
-  const double old_dist2 = p->dist2;
-  const double dist2 = resid[0]*resid[0]+resid[1]*resid[1];
-  const double decr = old_dist2-dist2;
-  const double pred = p->dist2p;
-  out->x[0]=p->x[0],out->x[1]=p->x[1];
-  out->oldr[0]=p->r[0],out->oldr[1]=p->r[1];
-  out->index=p->index;
-  out->dist2=dist2;
-#ifdef DIAGNOSTICS_2
-  printf("Checking prior step:\n"
-         "       old r = (%.17g,%.17g), old flags = %x\n"
-         "   old_dist2 = %.17g\n"
-         "           r = (%.17g,%.17g),     flags = %x\n"
-         "       dist2 = %.17g\n"
-         "  difference = %.17g\n"
-         "   predicted = %.17g\n"
-         "         rho = %.17g\n",
-         p->oldr[0],p->oldr[1],(p->flags>>5)&FLAG_MASK,old_dist2,
-         p->r[0],p->r[1],p->flags&FLAG_MASK,dist2,
-         decr, pred, decr/pred);
-#endif
-  if(decr>= 0.01 * pred) {
-    if(decr>= 0.9 * pred) {
-      out->tr = p->tr*2;
-#ifdef DIAGNOSTICS_2
-      printf("  very good iteration; tr -> %g\n", out->tr);
-#endif
-    } else {
-#ifdef DIAGNOSTICS_2
-      printf("  good iteration; tr = %g\n", p->tr);
-#endif
-      out->tr = p->tr;
-    }
-    return 0;
-  } else {
-    /* reject step; note: the point will pass through this routine
-       again, and we set things up here so it gets classed as a
-       "very good iteration" --- this doubles the trust radius,
-       which is why we divide by 4 below */
-    double v0 = fabs(p->r[0]-p->oldr[0]),
-           v1 = fabs(p->r[1]-p->oldr[1]);
-    out->tr = (v0>v1?v0:v1)/4;
-#ifdef DIAGNOSTICS_2
-    printf("  bad iteration; tr -> %g\n", out->tr);
-#endif
-    out->dist2=old_dist2;
-    out->r[0]=p->oldr[0],out->r[1]=p->oldr[1];
-    out->flags=p->flags>>5;
-    out->dist2p=-DBL_MAX;
-    if(pred < dist2*tol) out->flags|=CONVERGED_FLAG;
-    return 1;
-  }
-}
-
-/* minimize ||resid - jac * dr||_2, with |dr| <= tr, |r0+dr|<=1
-   (exact solution of trust region problem) */
-static void newton_area(struct findpts_el_pt_2 *const out,
-                        const double jac[4], const double resid[2],
-                        const struct findpts_el_pt_2 *const p, const double tol)
-{
-  const double tr = p->tr;
-  double bnd[4] = { -1,1, -1,1 };
-  double r0[2];
-  double dr[2], fac;
-  unsigned d, mask, flags;
-
-  r0[0] = p->r[0], r0[1] = p->r[1];
-
-#ifdef DIAGNOSTICS_1
-  printf("newton_area:\n");
-  printf("  resid = (%g,%g); r^T r / 2 = %g\n",resid[0],resid[1],
-         (resid[0]*resid[0]+resid[1]*resid[1])/2);
-  printf("  jac = %g\t%g\n"
-         "        %g\t%g\n",
-         jac[0],jac[1],jac[2],jac[3]);
-  printf("  r = (%.17g,%.17g)\n",r0[0],r0[1]);
-#endif
-
-  mask = 0xfu;
-  for(d=0;d<2;++d) {
-    if(r0[d]-tr>-1) bnd[2*d  ]=r0[d]-tr, mask^=1u<<(2*d);
-    if(r0[d]+tr< 1) bnd[2*d+1]=r0[d]+tr, mask^=2u<<(2*d);
-  }
-
-  lin_solve_2(dr, jac,resid);
-
-#ifdef DIAGNOSTICS_1
-  printf("  min at r = (%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]);
-#endif
-
-  fac = 1, flags = 0;
-  for(d=0;d<2;++d) {
-    double nr = r0[d]+dr[d];
-    if((nr-bnd[2*d])*(bnd[2*d+1]-nr)>=0) continue;
-    if(nr<bnd[2*d]) {
-      double f = (bnd[2*d  ]-r0[d])/dr[d];
-      if(f<fac) fac=f, flags = 1u<<(2*d);
-    } else {
-      double f = (bnd[2*d+1]-r0[d])/dr[d];
-      if(f<fac) fac=f, flags = 2u<<(2*d);
-    }
-  }
-
-#ifdef DIAGNOSTICS_1
-  printf("  flags = %x, fac = %.15g\n",flags,fac);
-#endif
-
-  if(flags==0) goto newton_area_fin;
-
-  for(d=0;d<2;++d) dr[d]*=fac;
-
-  newton_area_edge: {
-    const unsigned ei = edge_index(flags);
-    const unsigned dn = ei>>1, de = plus_1_mod_2(dn);
-    const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]),
-                 res1 = resid[1]-(jac[2]*dr[0]+jac[3]*dr[1]);
-    /* y = J_u^T res */
-    const double y = jac[de]*res0+jac[2+de]*res1;
-    /* JtJ = J_u^T J_u */
-    const double JtJ = jac[  de]*jac[  de]
-                      +jac[2+de]*jac[2+de];
-    const double drc = y/JtJ;
-    double ffac = 1;
-    unsigned new_flags = 0;
-#ifdef DIAGNOSTICS_1
-    printf("  edge %u, de=%u\n",ei,de);
-    printf("    r=(%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]);
-    printf("    resid = (%g,%g); r^T r / 2 = %g\n",res0,res1,
-           (res[0]*res[0]+res[1]*res[1])/2);
-    printf("    min at %.17g\n", r0[de]+dr[de]+drc);
-#endif
-    {
-      const double rz = r0[de]+dr[de], lb=bnd[2*de],ub=bnd[2*de+1];
-      const double nr = r0[de]+(dr[de]+drc);
-      if((nr-lb)*(ub-nr)<0) {
-        if(nr<lb) {
-          double f = (lb-rz)/drc;
-          if(f<ffac) ffac=f, new_flags = 1u<<(2*de);
-        } else {
-          double f = (ub-rz)/drc;
-          if(f<ffac) ffac=f, new_flags = 2u<<(2*de);
-        }
-      }
-    }
-#ifdef DIAGNOSTICS_1
-    printf("    new_flags = %x, ffac = %.17g\n",new_flags,ffac);
-#endif
-    dr[de] += ffac*drc;
-    flags |= new_flags;
-    goto newton_area_relax;
-  }
-
-  /* check and possibly relax constraints */
-  newton_area_relax: {
-    const unsigned old_flags = flags;
-    /* res := res_0 - J dr */
-    const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]),
-                 res1 = resid[1]-(jac[2]*dr[0]+jac[3]*dr[1]);
-    /* y := J^T res */
-    double y[2]; y[0] = jac[0]*res0+jac[2]*res1,
-                 y[1] = jac[1]*res0+jac[3]*res1;
-    #define SETDR(d) do { \
-      unsigned f = flags>>(2*d) & 3u; \
-      if(f) dr[d] = bnd[2*d+(f-1)] - r0[d]; \
-    } while(0)
-    SETDR(0); SETDR(1);
-    #undef SETDR
-    for(d=0;d<2;++d) {
-      unsigned c = flags>>(2*d) & 3u;
-      if(c==0) continue;
-      else if(dr[d]*y[d]<0) flags &= ~(3u<<(2*d));
-#ifdef DIAGNOSTICS_1
-      if( (c==1&&dr[d]>0) || (c==2&&dr[d]<0) )
-        printf("FAIL! c=%u, dr[d]=%g\n",c,dr[d]);
-#endif
-    }
-#ifdef DIAGNOSTICS_1
-    printf("  checking constraints (%x)\n",old_flags);
-    printf("    r=(%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]);
-    printf("    resid = (%g,%g); r^T r / 2 = %g\n",res[0],res[1],
-           (res[0]*res[0]+res[1]*res[1])/2);
-    printf("    relaxed %x -> %x\n",old_flags,flags);
-#endif
-    if(flags==old_flags) goto newton_area_fin;
-    switch(num_constrained(flags)) {
-      case 1: goto newton_area_edge;
-    }
-  }
-
-newton_area_fin:
-#ifdef DIAGNOSTICS_1
-  {
-    const double res[2]={ resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]),
-                          resid[1]-(jac[2]*dr[0]+jac[3]*dr[1]) };
-    printf("  r=(%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1]);
-    printf("  resid = (%g,%g); r^T r / 2 = %g\n",res[0],res[1],
-           (res[0]*res[0]+res[1]*res[1])/2);
-  }
-#endif
-  flags &= mask;
-  if(fabs(dr[0])+fabs(dr[1]) < tol) flags |= CONVERGED_FLAG;
-  {
-    const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]),
-                 res1 = resid[1]-(jac[2]*dr[0]+jac[3]*dr[1]);
-    out->dist2p=resid[0]*resid[0]+resid[1]*resid[1]
-                -(res0*res0+res1*res1);
-  }
-  #define SETR(d) do { \
-    unsigned f = flags>>(2*d) & 3u; \
-    out->r[d] = f==0 ? r0[d]+dr[d] : ( f==1 ? -1 : 1 ); \
-  } while(0)
-  SETR(0); SETR(1);
-  #undef SETR
-  out->flags = flags | (p->flags<<5);
-}
-
-static void newton_edge(struct findpts_el_pt_2 *const out,
-  const double jac[4], const double rhes, const double resid[2],
-  const unsigned de, const unsigned dn,
-  unsigned flags,
-  const struct findpts_el_pt_2 *const p, const double tol)
-{
-  const double tr = p->tr;
-  /* A = J^T J - resid_d H_d */
-  const double A = jac[  de]*jac[  de]
-                  +jac[2+de]*jac[2+de] - rhes;
-  /* y = J^T r */
-  const double y = jac[  de]*resid[0]
-                  +jac[2+de]*resid[1];
-
-  const double oldr = p->r[de];
-  double dr,nr,tdr,tnr;
-  double v,tv; unsigned new_flags=0, tnew_flags=0;
-
-#ifdef DIAGNOSTICS_1
-  printf("Newton edge %u (dn=%u) flags=%x\n",de,dn,flags);
-  printf("  A=%g, y=%g\n",A,y);
-  if(A<=0) printf("  A not positive\n");
-  printf("  r=(%.17g,%.17g)\n",p->r[0],p->r[1]);
-#endif
-
-  #define EVAL(dr) (dr*A-2*y)*dr
-
-  /* if A is not SPD, quadratic model has no minimum */
-  if(A>0) {
-    dr = y/A, nr = oldr+dr;
-    if(fabs(dr)<tr && fabs(nr)<1) { v=EVAL(dr); goto newton_edge_fin; }
-  }
-
-  if(( nr=oldr-tr)>-1)  dr=-tr;
-  else                  nr=-1,  dr=-1-oldr,  new_flags = flags | 1u<<(2*de);
-  v =EVAL( dr);
-
-  if((tnr=oldr+tr)< 1) tdr=tr;
-  else                 tnr= 1, tdr= 1-oldr, tnew_flags = flags | 2u<<(2*de);
-  tv=EVAL(tdr);
-
-  if(tv<v) nr=tnr, dr=tdr, v=tv, new_flags=tnew_flags;
-
-newton_edge_fin:
-  /* check convergence */
-  if(fabs(dr) < tol) new_flags |= CONVERGED_FLAG;
-  out->r[de]=nr;
-  out->r[dn]=p->r[dn];
-  out->dist2p = -v;
-  out->flags = flags | new_flags | (p->flags<<5);
-#ifdef DIAGNOSTICS_1
-  printf("  new r = (%.17g,%.17g)\n",out->r[0],out->r[1]);
-#endif
-}
-
-typedef void findpt_fun(
-  struct findpts_el_pt_2 *const out,
-  struct findpts_el_data_2 *const fd,
-  const struct findpts_el_pt_2 *const p, const unsigned pn, const double tol);
-
-/* work[(6+2*(2*nr+ns))*pn] */
-static void findpt_area(
-  struct findpts_el_pt_2 *const out,
-  struct findpts_el_data_2 *const fd,
-  const struct findpts_el_pt_2 *const p, const unsigned pn, const double tol)
-{
-  const unsigned nr=fd->n[0],ns=fd->n[1];
-  double *const resid = fd->work, *const jac = resid + 2*pn,
-         *const wtr = jac+4*pn, *const wts = wtr+2*nr*pn,
-         *const slice = wts+2*ns*pn;
-  unsigned i; unsigned d;
-  /* evaluate x(r) and jacobian */
-  for(i=0;i<pn;++i)
-    fd->lag[0](wtr+2*i*nr, fd->lag_data[0], nr, 1, p[i].r[0]);
-  for(i=0;i<pn;++i)
-    fd->lag[1](wts+2*i*ns, fd->lag_data[1], ns, 1, p[i].r[1]);
-  for(d=0;d<2;++d) {
-    tensor_mxm(slice,nr, fd->x[d],ns, wts,2*pn);
-    for(i=0;i<pn;++i) {
-      const double *const wtr_i = wtr+2*i*nr, *const slice_i = slice+2*i*nr;
-      double *const jac_i = jac+4*i+2*d;
-      resid[2*i+d] = p[i].x[d] - tensor_ig1(jac_i,
-        wtr_i,nr, slice_i);
-      jac_i[1] = tensor_i1(wtr_i,nr, slice_i+nr);
-    }
-  }
-  /* perform Newton step */
-  for(i=0;i<pn;++i) {
-    if(reject_prior_step_q(out+i,resid+2*i,p+i,tol)) continue;
-    else newton_area(out+i, jac+4*i, resid+2*i, p+i, tol);
-  }
-}
-
-/* work[(10+3*n)*pn] */
-static void findpt_edge(
-  struct findpts_el_pt_2 *const out,
-  struct findpts_el_data_2 *const fd,
-  const struct findpts_el_pt_2 *const p, const unsigned pn, const double tol)
-{
-  const unsigned pflag = p->flags & FLAG_MASK;
-  const unsigned ei = edge_index(pflag);
-  const unsigned dn = ei>>1, de = plus_1_mod_2(dn);
-  const unsigned n = fd->n[de];
-  double *const resid=fd->work, *const jac=resid+2*pn, *const hes=jac+4*pn,
-         *const wt = hes+pn, *const slice = wt+3*n*pn;
-  const struct findpts_el_gedge_2 *const edge = get_edge(fd,ei);
-  unsigned i; unsigned d;
-
-#ifdef DIAGNOSTICS_1
-  printf("Edge %u\n",ei);
-  printf("  pflag = %u\n",pflag);
-  printf("  ei = %u\n",ei);
-  printf("  dn, de = %u, %u\n",dn,de);
-  printf("  n = %u \n", n);
-#endif
-
-  /* evaluate x(r), jacobian, hessian */
-  for(i=0;i<pn;++i)
-    fd->lag[de](wt+3*i*n, fd->lag_data[de], n, 2, p[i].r[de]);
-  for(i=0;i<pn;++i) hes[i]=0;
-  for(d=0;d<2;++d) {
-    tensor_mtxv(slice,3*pn, wt, edge->x[d],n);
-    for(i=0;i<pn;++i) {
-      const double *const slice_i = slice+3*i;
-      double r;
-      resid[2*i+d] = r = p[i].x[d] - slice_i[0];
-      jac[4*i+2*d+de] = slice_i[1];
-      hes[i] += r * slice_i[2];
-    }
-  }
-  for(i=1;i<pn;++i) memcpy(wt+i*n, wt+3*i*n, n*sizeof(double));
-  for(d=0;d<2;++d) {
-    tensor_mtxv(slice,pn, wt, edge->dxdn[d],n);
-    for(i=0;i<pn;++i) jac[4*i+2*d+dn] = slice[i];
-  }
-  /* perform Newton step */
-  for(i=0;i<pn;++i) {
-    double *const resid_i=resid+2*i, *const jac_i=jac+4*i, *const hes_i=hes+i;
-    /* check prior step */
-    if(!reject_prior_step_q(out+i,resid_i,p+i,tol)) {
-      /* check constraint */
-      const double steep = resid_i[0] * jac_i[  dn]
-                          +resid_i[1] * jac_i[2+dn];
-#ifdef DIAGNOSTICS_1
-      printf("jacobian = %g\t%g\n"
-             "           %g\t%g\n",jac_i[0],jac_i[1],jac_i[2],jac_i[3]);
-      printf("resid_i = (%g,%g)\n", resid_i[0],resid_i[1]);
-      printf("steep = %g (%s)\n", steep, steep * p[i].r[dn] < 0 ? "in" : "out");
-#endif
-      if(steep * p[i].r[dn] < 0) /* relax constraint */
-        newton_area(out+i, jac_i, resid_i, p+i, tol);
-      else
-        newton_edge(out+i, jac_i, *hes_i, resid_i, de,dn,pflag, p+i, tol);
-    }
-  }
-}
-
-static void findpt_pt(
-  struct findpts_el_pt_2 *const out,
-  struct findpts_el_data_2 *const fd,
-  const struct findpts_el_pt_2 *const p, const unsigned pn, const double tol)
-{
-  const unsigned pflag = p->flags & FLAG_MASK;
-  const unsigned pi = point_index(pflag);
-  const struct findpts_el_gpt_2 *gpt = get_pt(fd,pi);
-  const double *const x = gpt->x, *const jac = gpt->jac, *const hes = gpt->hes;
-  unsigned i;
-
-#ifdef DIAGNOSTICS_1
-  printf("Point %u\n",pi);
-  printf("  pflag = %u\n",pflag);
-  printf("  pi = %u\n",pi);
-#endif
-
-  for(i=0;i<pn;++i) {
-    double resid[2], steep[2], sr[2];
-    unsigned dn,de;
-    resid[0] = p[i].x[0]-x[0],
-    resid[1] = p[i].x[1]-x[1];
-    steep[0] = jac[0]*resid[0] + jac[2]*resid[1],
-    steep[1] = jac[1]*resid[0] + jac[3]*resid[1];
-    sr[0] = steep[0]*p[i].r[0],
-    sr[1] = steep[1]*p[i].r[1];
-    /* check prior step */
-    if(reject_prior_step_q(out+i,resid,p+i,tol)) continue;
-    /* check constraints */
-    if(sr[0]<0) {
-      if(sr[1]<0) goto findpt_pt_area;
-      else { de=0,dn=1; goto findpt_pt_edge; }
-    }
-    else if(sr[1]<0) { de=1,dn=0; goto findpt_pt_edge; }
-    out[i].r[0]=p[i].r[0],out[i].r[1]=p[i].r[1];
-    out[i].dist2p=0;
-    out[i].flags = pflag | CONVERGED_FLAG;
-    continue;
-    findpt_pt_area:
-      newton_area(out+i, jac,resid, p+i, tol);
-      continue;
-    findpt_pt_edge: {
-      const double rh = resid[0]*hes[de]+resid[1]*hes[2+de];
-      newton_edge(out+i, jac,rh,resid, de,dn,
-                  pflag&(3u<<(2*dn)), p+i, tol);
-    } continue;
-  }
-}
-
-static void seed(struct findpts_el_data_2 *const fd,
-                 struct findpts_el_pt_2 *const pt, const unsigned npt)
-{
-  struct findpts_el_pt_2 *p, *const pe = pt+npt;
-  const unsigned nr=fd->n[0], ns=fd->n[1];
-  unsigned i,j, ii=0;
-  for(p=pt;p!=pe;++p) p->dist2=DBL_MAX;
-  for(j=0;j<ns;++j) {
-    const double zs=fd->z[1][j];
-    for(i=0;i<nr;++i) {
-      const double zr=fd->z[0][i];
-      const double x=fd->x[0][ii], y=fd->x[1][ii];
-      ++ii;
-      for(p=pt;p!=pe;++p) {
-        const double dx=p->x[0]-x,dy=p->x[1]-y;
-        const double dist2 = dx*dx+dy*dy;
-        if(p->dist2<=dist2) continue;
-        p->dist2=dist2;
-        p->r[0]=zr, p->r[1]=zs;
-      }
-    }
-  }
-}
-
-void findpts_el_2(struct findpts_el_data_2 *const fd, const unsigned npt,
-                  const double tol)
-{
-  findpt_fun *const fun[3] =
-    { &findpt_area, &findpt_edge, &findpt_pt };
-  struct findpts_el_pt_2 *const pbuf = fd->p, *const pstart = fd->p + npt;
-  unsigned nconv = npt;
-  unsigned step = 0;
-  unsigned count[9] = { 0,0,0, 0,0,0, 0,0,0 } ;
-  count[0]=npt;
-  seed(fd,pbuf,npt);
-  { unsigned i;
-    for(i=0;i<npt;++i) {
-      pstart[i].x[0]=pbuf[i].x[0];
-      pstart[i].x[1]=pbuf[i].x[1];
-      pstart[i].r[0]=pbuf[i].r[0];
-      pstart[i].r[1]=pbuf[i].r[1];
-      pstart[i].index=i,pstart[i].flags=0;
-      pstart[i].dist2=DBL_MAX,pstart[i].dist2p=0,pstart[i].tr=1;
-    }
-  }
-  while(nconv && step++ < 50) {
-    /* advance each group of points */
-    struct findpts_el_pt_2 *p, *const pe=pstart+nconv, *pout; unsigned pn;
-
-#if DIAGNOSTICS_ITERATIONS>1
-    { unsigned i;
-      printf("findpts_el_2 Newton step (%u), %u unconverged:\n ", step,nconv);
-      for(i=0;i<9;++i) printf(" %u",count[i]);
-      printf("\n");
-    }
-#endif
-
-    for(p=pstart,pout=pbuf; p!=pe; p+=pn,pout+=pn) {
-      const unsigned pflags = p->flags & FLAG_MASK;
-      pn = count[pt_flags_to_bin_noC(pflags)];
-      fun[num_constrained(pflags)](pout, fd, p,pn, tol);
-    }
-    /* group points by contsraints */
-    {
-      unsigned offset[10] = { 0,0,0, 0,0,0, 0,0,0, 0 };
-      struct findpts_el_pt_2 *const ppe = pbuf+nconv;
-      for(pout=pbuf; pout!=ppe; ++pout)
-        ++offset[pt_flags_to_bin(pout->flags & FLAG_MASK)];
-      {
-        unsigned i; unsigned sum=0;
-        for(i=0;i<9;++i) {
-          unsigned ci=offset[i]; count[i]=ci, offset[i]=sum, sum+=ci;
-        }
-        nconv = offset[9] = sum; /* last bin is converged; forget it */
-      }
-      for(pout=pbuf; pout!=ppe; ++pout)
-        pstart[offset[pt_flags_to_bin(pout->flags & FLAG_MASK)]++] = *pout;
-    }
-  }
-  { struct findpts_el_pt_2 *p, *const pe=pstart+npt;
-    for(p=pstart;p!=pe;++p)
-      pbuf[p->index]=*p, pbuf[p->index].flags&=FLAG_MASK;
-  }
-#if DIAGNOSTICS_ITERATIONS
-  printf("findpts_el_2 took %u steps\n ", step);
-#endif
-}
-
-void findpts_el_eval_2(
-        double *const out_base, const unsigned out_stride,
-  const double *const   r_base, const unsigned   r_stride, const unsigned pn,
-  const double *const in, struct findpts_el_data_2 *const fd)
-{
-  const unsigned nr=fd->n[0],ns=fd->n[1];
-  double *const wtr = fd->work, *const wts = wtr+nr*pn,
-         *const slice = wts+ns*pn;
-  unsigned i; const double *r; double *out;
-  for(i=0,r=r_base;i<pn;++i) {
-    fd->lag[0](wtr+i*nr, fd->lag_data[0], nr, 0, r[0]);
-    fd->lag[1](wts+i*ns, fd->lag_data[1], ns, 0, r[1]);
-    r = (const double*)((const char*)r + r_stride);
-  }
-
-  tensor_mxm(slice,nr, in,ns, wts,pn);
-  for(i=0,out=out_base;i<pn;++i) {
-    const double *const wtr_i = wtr+i*nr, *const slice_i = slice+i*nr;
-    *out = tensor_i1(wtr_i,nr, slice_i);
-    out = (double*)((char*)out + out_stride);
-  }
-}
diff --git a/3rdParty/gslib/src/findpts_el_3.c b/3rdParty/gslib/src/findpts_el_3.c
deleted file mode 100644
index 6948f60f5..000000000
--- a/3rdParty/gslib/src/findpts_el_3.c
+++ /dev/null
@@ -1,1318 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <float.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "mem.h"
-#include "tensor.h"
-#include "poly.h"
-
-#define findpts_el_setup_3   PREFIXED_NAME(findpts_el_setup_3)
-#define findpts_el_free_3    PREFIXED_NAME(findpts_el_free_3 )
-#define findpts_el_3         PREFIXED_NAME(findpts_el_3      )
-#define findpts_el_eval_3    PREFIXED_NAME(findpts_el_eval_3 )
-/*
-#define DIAGNOSTICS_1
-#define DIAGNOSTICS_2
-*/
-#define DIAGNOSTICS_ITERATIONS 0
-
-#if defined(DIAGNOSTICS_1) || defined(DIAGNOSTICS_2) \
-    || DIAGNOSTICS_ITERATIONS > 0
-#include <stdio.h>
-#endif
-
-/* A is row-major */
-static void lin_solve_3(double x[3], const double A[9], const double y[3])
-{
-  const double a = A[4]*A[8]-A[5]*A[7],
-               b = A[5]*A[6]-A[3]*A[8],
-               c = A[3]*A[7]-A[4]*A[6],
-            idet = 1/(A[0]*a+A[1]*b+A[2]*c);
-  const double
-    inv0 = a,
-    inv1 = A[2]*A[7]-A[1]*A[8],
-    inv2 = A[1]*A[5]-A[2]*A[4],
-    inv3 = b,
-    inv4 = A[0]*A[8]-A[2]*A[6],
-    inv5 = A[2]*A[3]-A[0]*A[5],
-    inv6 = c,
-    inv7 = A[1]*A[6]-A[0]*A[7],
-    inv8 = A[0]*A[4]-A[1]*A[3];
-  x[0] = idet*(inv0*y[0] + inv1*y[1] + inv2*y[2]);
-  x[1] = idet*(inv3*y[0] + inv4*y[1] + inv5*y[2]);
-  x[2] = idet*(inv6*y[0] + inv7*y[1] + inv8*y[2]);
-}
-
-static void lin_solve_sym_2(double x[2], const double A[3], const double y[2])
-{
-  const double idet = 1/(A[0]*A[2] - A[1]*A[1]);
-  x[0] = idet*(A[2]*y[0] - A[1]*y[1]);
-  x[1] = idet*(A[0]*y[1] - A[1]*y[0]);
-}
-
-
-struct findpts_el_pt_3 {
-  double x[3],r[3],oldr[3],dist2,dist2p,tr;
-  unsigned index,flags;
-};
-
-/* the bit structure of flags is CTTSSRR
-   the C bit --- 1<<6 --- is set when the point is converged
-   RR is 0 = 00b if r is unconstrained,
-         1 = 01b if r is constrained at -1
-         2 = 10b if r is constrained at +1
-   SS, TT are similarly for s and t constraints
-*/
-
-#define CONVERGED_FLAG (1u<<6)
-#define FLAG_MASK 0x7fu
-
-static unsigned num_constrained(const unsigned flags)
-{
-  const unsigned y = flags | flags>>1;
-  return (y&1u) + (y>>2 & 1u) + (y>>4 & 1u);
-}
-
-static unsigned pt_flags_to_bin_noC(const unsigned flags)
-{
-  return ((flags>>4 & 3u)*3 + (flags>>2 & 3u))*3 + (flags & 3u);
-}
-
-/* map flags to 27 if the C bit is set,
-   else to [0,26] --- the 27 valid configs of TTSSRR */
-static unsigned pt_flags_to_bin(const unsigned flags)
-{
-  const unsigned mask = 0u - (flags>>6); /* 0 or 0xfff... when converged */
-  return (mask & 27u) | (~mask & pt_flags_to_bin_noC(flags));
-}
-
-/* assumes x = 0, 1, or 2 */
-static unsigned plus_1_mod_3(const unsigned x) { return ((x | x>>1)+1) & 3u; }
-static unsigned plus_2_mod_3(const unsigned x)
-{
-  const unsigned y = (x-1) & 3u;
-  return y ^ (y>>1);
-}
-
-/* assumes x = 1 << i, with i < 6, returns i+1 */
-static unsigned which_bit(const unsigned x)
-{
-  const unsigned y = x&7u;
-  return (y-(y>>2)) | ((x-1)&4u) | (x>>4);
-}
-
-static unsigned face_index(const unsigned x) { return which_bit(x)-1; }
-
-static unsigned edge_index(const unsigned x)
-{
-  const unsigned y = ~((x>>1) | x);
-  const unsigned RTSR = ((x>>1)&1u) | ((x>>2)&2u) | ((x>>3)&4u) | ((x<<2)&8u);
-  const unsigned re = RTSR>>1;
-  const unsigned se = 4u | RTSR>>2;
-  const unsigned te = 8u | (RTSR&3u);
-  return   ( (0u - ( y    &1u)) & re )
-         | ( (0u - ((y>>2)&1u)) & se )
-         | ( (0u - ((y>>4)&1u)) & te );
-}
-
-static unsigned point_index(const unsigned x)
-{
-  return ((x>>1)&1u) | ((x>>2)&2u) | ((x>>3)&4u);
-}
-
-/* extra data
-
-  we need x, dx/dn for each face
-    rs: x at 0, nrst - nrs,
-      6*nrs extra for dx/dn
-    st: 12*nst extra
-    tr: 12*ntr extra
-      (transposed order for embedded t-edges)
-
-  for each edge,
-    have x, dx/dn2 already as part of face data
-    need dx/dn1 (strided in face data)
-    need d^2x/dn1^2, d^2x/dn2^2 possibly, if constraints relax
-      thats 3*4*(nr+ns+nt) extra
-
-*/
-
-struct findpts_el_gface_3 { const double *x[3], *dxdn[3]; };
-struct findpts_el_gedge_3 { const double *x[3], *dxdn1[3], *dxdn2[3],
-                                         *d2xdn1[3], *d2xdn2[3]; };
-struct findpts_el_gpt_3   { double x[3], jac[9], hes[18]; };
-
-struct findpts_el_data_3 {
-  unsigned npt_max;
-  struct findpts_el_pt_3 *p;
-
-  unsigned n[3];
-  double *z[3];
-  lagrange_fun *lag[3];
-  double *lag_data[3];
-  double *wtend[3];
-
-  const double *x[3];
-
-  unsigned side_init;
-  double *sides;
-  struct findpts_el_gface_3 face[6]; /* ST R=-1,R=+1; TR S=-1,S=+1; ... */
-  struct findpts_el_gedge_3 edge[12]; /* R S=-1,T=-1; R S=1,T=-1; ... */
-  struct findpts_el_gpt_3 pt[8];
-
-  double *work;
-};
-
-/* work[2*nt+2*nrs] */
-/* work[4*nr+4*nst] */
-/* work[4*ns+4*nr] */
-/* work[4*n1+4*n], work[2*n2+2*n] */
-/* work[4*nr+4], work[2*nt+2] */
-/* work[(3+9+2*(nr+ns+nt+nrs))*pn + max(2*nr,ns) ] */
-/* work[(3+9+3+3*(n1+n2+n1))*pn ] */
-/* work[ 3*n ] */
-static unsigned work_size(
-  const unsigned nr, const unsigned ns, const unsigned nt,
-  const unsigned npt_max)
-{
-  unsigned n1, n2, wsize;
-  if(nr>ns) {
-    if(nr>nt) n1=nr, n2 = (ns>nt ? ns : nt);
-    else      n1=nt, n2 = nr;
-  } else {
-    if(ns>nt) n1=ns, n2 = (nr>nt ? nr : nt);
-    else      n1=nt, n2 = ns;
-  }
-  #define DO_MAX(x) do { const unsigned temp=(x); \
-                         wsize=temp>wsize?temp:wsize; } while(0)
-  wsize = (12 + 2*(nr+ns+nt+nr*ns)) * npt_max + (2*nr>ns?2*nr:ns);
-  DO_MAX(2*(nt+nr*ns));
-  DO_MAX(4*(nr+ns*nt));
-  DO_MAX(4*(n1+n2));
-  DO_MAX(npt_max*(15+3*(2*n1+n2)));
-  #undef DO_MAX
-  return wsize;
-}
-
-void findpts_el_setup_3(struct findpts_el_data_3 *const fd,
-                        const unsigned n[3],
-                        const unsigned npt_max)
-{
-  const unsigned nr=n[0], ns=n[1], nt=n[2];
-  const unsigned nrs = nr*ns, nst=ns*nt, ntr=nt*nr;
-  const unsigned face_size = 12*nst + 12*ntr + 6*nrs;
-  const unsigned off_es = face_size + 36*nr, off_et = off_es + 36*ns,
-                 tot = off_et + 36*nt;
-  unsigned d,i, lag_size[3];
-
-  fd->npt_max = npt_max;
-  fd->p = tmalloc(struct findpts_el_pt_3, npt_max*2);
-
-  fd->n[0]=nr, fd->n[1]=ns, fd->n[2]=nt;
-  for(d=0;d<3;++d) lag_size[d] = gll_lag_size(fd->n[d]);
-
-  fd->z[0]        = tmalloc(double,lag_size[0]+lag_size[1]+lag_size[2]
-                                   +7*(nr+ns+nt) + tot +
-                                   work_size(nr,ns,nt,npt_max));
-  fd->z[1]        = fd->z[0]+nr;
-  fd->z[2]        = fd->z[1]+ns;
-  fd->lag_data[0] = fd->z[2]+nt;
-  fd->lag_data[1] = fd->lag_data[0]+lag_size[0];
-  fd->lag_data[2] = fd->lag_data[1]+lag_size[1];
-  fd->wtend[0]    = fd->lag_data[2]+lag_size[2];
-  fd->wtend[1]    = fd->wtend[0]+6*nr;
-  fd->wtend[2]    = fd->wtend[1]+6*ns;
-  fd->sides       = fd->wtend[2]+6*nt;
-  fd->work        = fd->sides + tot;
-
-  fd->side_init = 0;
-
-  for(d=0;d<3;++d) {
-    double *wt=fd->wtend[d]; unsigned nn=fd->n[d];
-    lobatto_nodes(fd->z[d],nn);
-    fd->lag[d] = gll_lag_setup(fd->lag_data[d],nn);
-    fd->lag[d](wt    , fd->lag_data[d],nn,2,-1);
-    fd->lag[d](wt+3*nn, fd->lag_data[d],nn,2, 1);
-
-    wt[0]=1; for(i=1;i<nn;++i) wt[i]=0;
-    wt+=3*nn; { for(i=0;i<nn-1;++i) wt[i]=0; } wt[i]=1;
-  }
-
-  #define SET_FACE(i,base,n) do { for(d=0;d<3;++d) \
-    fd->face[2*i  ].x[d]    = fd->sides + base +    d *n, \
-    fd->face[2*i  ].dxdn[d] = fd->sides + base + (3+d)*n, \
-    fd->face[2*i+1].x[d]    = fd->sides + base + (6+d)*n, \
-    fd->face[2*i+1].dxdn[d] = fd->sides + base + (9+d)*n; \
-  } while(0)
-  SET_FACE(0,0,nst);
-  SET_FACE(1,12*nst,ntr);
-  #undef SET_FACE
-
-  for(d=0;d<3;++d)
-    fd->face[4].x[d] = 0, /* will point to user data */
-    fd->face[4].dxdn[d] = fd->sides + 12*(nst+ntr) + d*nrs,
-    fd->face[5].x[d] = 0, /* will point to user data */
-    fd->face[5].dxdn[d] = fd->sides + 12*(nst+ntr) + (3+d)*nrs;
-
-  #define SET_EDGE1(j,k,d,rd,rn,base) \
-    for(i=0;i<2;++i) \
-      fd->edge[4*j+2*i+0].dxdn2[d] = fd->face[2*k+i].dxdn[d], \
-      fd->edge[4*j+2*i+1].dxdn2[d] = fd->face[2*k+i].dxdn[d]+n##rd##rn-n##rd;
-  #define SET_EDGE2(j,d,rd,rn,base) \
-    for(i=0;i<4;++i) \
-      fd->edge[4*j+i].dxdn1 [d] = fd->sides + base + (9*i  +d)*n##rd, \
-      fd->edge[4*j+i].d2xdn1[d] = fd->sides + base + (9*i+3+d)*n##rd, \
-      fd->edge[4*j+i].d2xdn2[d] = fd->sides + base + (9*i+6+d)*n##rd;
-  #define SET_EDGE(j,rd,rn,base) do { \
-    for(d=0;d<3;++d) { SET_EDGE1(j,plus_2_mod_3(j),d,rd,rn,base); \
-                       SET_EDGE2(j,d,rd,rn,base); } \
-  } while(0)
-  SET_EDGE(0,r,s,face_size);
-  SET_EDGE(1,s,t,off_es);
-  SET_EDGE(2,t,r,off_et);
-  #undef SET_EDGE
-  #undef SET_EDGE2
-  #undef SET_EDGE1
-}
-
-void findpts_el_free_3(struct findpts_el_data_3 *const fd)
-{
-  free(fd->p);
-  free(fd->z[0]);
-}
-
-typedef void compute_face_data_fun(struct findpts_el_data_3 *fd);
-
-/* work[2*nt+2*nrs] */
-static void compute_face_data_rs(struct findpts_el_data_3 *fd)
-{
-  const unsigned nr = fd->n[0], ns=fd->n[1], nt=fd->n[2],
-                 nrs = nr*ns, nst=ns*nt, ntr = nt*nr, nrstm1 = nrs*(nt-1);
-  unsigned d;
-  double *work = fd->work, *out = fd->sides + 12*(nst+ntr);
-  memcpy(work   , fd->wtend[2]+  nt, nt*sizeof(double));
-  memcpy(work+nt, fd->wtend[2]+4*nt, nt*sizeof(double));
-  for(d=0;d<3;++d) {
-    tensor_mxm(work+2*nt,nrs, fd->x[d],nt, work,2);
-    memcpy(out+   d *nrs, work+2*nt       , nrs*sizeof(double));
-    memcpy(out+(3+d)*nrs, work+2*nt+nrs   , nrs*sizeof(double));
-    fd->face[4].x[d] = fd->x[d];
-    fd->face[5].x[d] = fd->x[d] + nrstm1;
-  }
-}
-
-/* work[4*nr+4*nst] */
-static void compute_face_data_st(struct findpts_el_data_3 *fd)
-{
-  const unsigned nr = fd->n[0], ns=fd->n[1], nt=fd->n[2], nst=ns*nt;
-  unsigned i;
-  double *work = fd->work, *out = fd->sides;
-  memcpy(work     , fd->wtend[0]     , 2*nr*sizeof(double));
-  memcpy(work+2*nr, fd->wtend[0]+3*nr, 2*nr*sizeof(double));
-  for(i=0;i<3;++i) {
-    tensor_mtxm(work+4*nr,nst, fd->x[i],nr, work,4);
-    memcpy(out+   i *nst, work+4*nr      , nst*sizeof(double));
-    memcpy(out+(3+i)*nst, work+4*nr+  nst, nst*sizeof(double));
-    memcpy(out+(6+i)*nst, work+4*nr+2*nst, nst*sizeof(double));
-    memcpy(out+(9+i)*nst, work+4*nr+3*nst, nst*sizeof(double));
-  }
-}
-
-/* work[4*ns+4*nr] */
-static void compute_face_data_tr(struct findpts_el_data_3 *fd)
-{
-  const unsigned nr = fd->n[0], ns=fd->n[1], nt=fd->n[2],
-                 nrs = nr*ns, nst=ns*nt, ntr=nt*nr;
-  unsigned i,k,d;
-  double *work = fd->work, *out = fd->sides + 12*nst;
-  memcpy(work     , fd->wtend[1]     , 2*ns*sizeof(double));
-  memcpy(work+2*ns, fd->wtend[1]+3*ns, 2*ns*sizeof(double));
-  for(d=0;d<3;++d) {
-    for(k=0;k<nt;++k) {
-      double *outk; double *in = work+4*ns;
-      tensor_mxm(in,nr, fd->x[d]+k*nrs,ns, work,4);
-      for(outk=out+   d *ntr+k,i=0;i<nr;++i,outk+=nt) *outk=*in++;
-      for(outk=out+(3+d)*ntr+k,i=0;i<nr;++i,outk+=nt) *outk=*in++;
-      for(outk=out+(6+d)*ntr+k,i=0;i<nr;++i,outk+=nt) *outk=*in++;
-      for(outk=out+(9+d)*ntr+k,i=0;i<nr;++i,outk+=nt) *outk=*in++;
-    }
-  }
-}
-
-static const struct findpts_el_gface_3 *get_face(
-  struct findpts_el_data_3 *fd, unsigned fi)
-{
-  const unsigned mask = 1u<<(fi/2);
-  if((fd->side_init&mask)==0) {
-    compute_face_data_fun *const fun[3] = {
-      compute_face_data_st,
-      compute_face_data_tr,
-      compute_face_data_rs
-    };
-    fun[fi/2](fd);
-    fd->side_init |= mask;
-  }
-  return &fd->face[fi];
-}
-
-/* work[4*n1+4*n], work[2*n2+2*n] */
-static void compute_edge_data(struct findpts_el_data_3 *fd, unsigned d)
-{
-  const unsigned dn1 = plus_1_mod_3(d), dn2 = plus_2_mod_3(d);
-  const unsigned n = fd->n[d], n1 = fd->n[dn1], n2 = fd->n[dn2];
-  const unsigned nr=fd->n[0],ns=fd->n[1],nt=fd->n[2],
-                 nrs=nr*ns,nst=ns*nt,ntr=nt*nr;
-  const unsigned base = 6*nrs + 12*nst + 12*ntr
-                        + (d>0 ? 36*nr : 0) + (d>1 ? 36*ns : 0);
-  #define DXDN1(i,d)  (fd->sides+base+(9*(i)  +(d))*n)
-  #define D2XDN1(i,d) (fd->sides+base+(9*(i)+3+(d))*n)
-  #define D2XDN2(i,d) (fd->sides+base+(9*(i)+6+(d))*n)
-  const struct findpts_el_gface_3 *face_d_n1 = get_face(fd,2*dn2),
-                                  *face_n2_d = get_face(fd,2*dn1);
-  struct findpts_el_gedge_3 *e = fd->edge + 4*d;
-  unsigned i,xd;
-  double *work = fd->work;
-  for(xd=0;xd<3;++xd) for(i=0;i<2;++i)
-    e[2*i  ].x[xd] = face_d_n1[i].x[xd],
-    e[2*i+1].x[xd] = face_d_n1[i].x[xd]+n*(n1-1);
-  memcpy(work     , fd->wtend[dn1]+  n1,2*n1*sizeof(double));
-  memcpy(work+2*n1, fd->wtend[dn1]+4*n1,2*n1*sizeof(double));
-  for(i=0;i<2;++i) for(xd=0;xd<3;++xd) {
-    tensor_mxm(work+4*n1,n, face_d_n1[i].x[xd],n1, work,4);
-    memcpy( DXDN1(2*i+0,xd), work+4*n1    , n*sizeof(double));
-    memcpy(D2XDN1(2*i+0,xd), work+4*n1+  n, n*sizeof(double));
-    memcpy( DXDN1(2*i+1,xd), work+4*n1+2*n, n*sizeof(double));
-    memcpy(D2XDN1(2*i+1,xd), work+4*n1+3*n, n*sizeof(double));
-  }
-  memcpy(work   , fd->wtend[dn2]+2*n2,n2*sizeof(double));
-  memcpy(work+n2, fd->wtend[dn2]+5*n2,n2*sizeof(double));
-  for(i=0;i<2;++i) for(xd=0;xd<3;++xd) {
-    tensor_mtxm(work+2*n2,n, face_n2_d[i].x[xd],n2, work,2);
-    memcpy(D2XDN2(  i,xd), work+2*n2  , n*sizeof(double));
-    memcpy(D2XDN2(2+i,xd), work+2*n2+n, n*sizeof(double));
-  }
-  #undef D2XDN2
-  #undef D2XDN1
-  #undef DXDN1
-}
-
-static const struct findpts_el_gedge_3 *get_edge(
-  struct findpts_el_data_3 *fd, unsigned ei)
-{
-  const unsigned mask = 8u<<(ei/4);
-  if((fd->side_init&mask)==0)
-    compute_edge_data(fd,ei/4), fd->side_init |= mask;
-  return &fd->edge[ei];
-}
-
-/* work[4*nr+4], work[2*nt+2] */
-static void compute_pt_data(struct findpts_el_data_3 *fd)
-{
-  const unsigned nr = fd->n[0], nt = fd->n[2];
-  const struct findpts_el_gedge_3 *e = get_edge(fd,0);
-  unsigned d,i;
-  double *work = fd->work;
-  for(i=0;i<4;++i) for(d=0;d<3;++d)
-    fd->pt[2*i  ].x[d] = e[i].x[d][0],
-    fd->pt[2*i  ].jac[3*d+1] = e[i].dxdn1[d][0],
-    fd->pt[2*i  ].jac[3*d+2] = e[i].dxdn2[d][0],
-    fd->pt[2*i  ].hes[6*d+3] = e[i].d2xdn1[d][0],
-    fd->pt[2*i  ].hes[6*d+5] = e[i].d2xdn2[d][0],
-    fd->pt[2*i+1].x[d] = e[i].x[d][nr-1],
-    fd->pt[2*i+1].jac[3*d+1] = e[i].dxdn1[d][nr-1],
-    fd->pt[2*i+1].jac[3*d+2] = e[i].dxdn2[d][nr-1],
-    fd->pt[2*i+1].hes[6*d+3] = e[i].d2xdn1[d][nr-1],
-    fd->pt[2*i+1].hes[6*d+5] = e[i].d2xdn2[d][nr-1];
-  memcpy(work     , fd->wtend[0]+  nr, 2*nr*sizeof(double));
-  memcpy(work+2*nr, fd->wtend[0]+4*nr, 2*nr*sizeof(double));
-  for(i=0;i<4;++i) for(d=0;d<3;++d) {
-    tensor_mtxv(work+4*nr,4, work, e[i].x[d],nr);
-    fd->pt[2*i  ].jac[3*d  ] = work[4*nr  ];
-    fd->pt[2*i  ].hes[6*d  ] = work[4*nr+1];
-    fd->pt[2*i+1].jac[3*d  ] = work[4*nr+2];
-    fd->pt[2*i+1].hes[6*d  ] = work[4*nr+3];
-  }
-  memcpy(work+nr,work+2*nr,nr*sizeof(double));
-  for(i=0;i<4;++i) for(d=0;d<3;++d) {
-    tensor_mtxv(work+2*nr,2, work, e[i].dxdn1[d],nr);
-    fd->pt[2*i  ].hes[6*d+1] = work[2*nr  ];
-    fd->pt[2*i+1].hes[6*d+1] = work[2*nr+1];
-    tensor_mtxv(work+2*nr,2, work, e[i].dxdn2[d],nr);
-    fd->pt[2*i  ].hes[6*d+2] = work[2*nr  ];
-    fd->pt[2*i+1].hes[6*d+2] = work[2*nr+1];
-  }
-  e = get_edge(fd,8);
-  memcpy(work   , fd->wtend[2]+  nt, nt*sizeof(double));
-  memcpy(work+nt, fd->wtend[2]+4*nt, nt*sizeof(double));
-  for(i=0;i<4;++i) for(d=0;d<3;++d) {
-    tensor_mtxv(work+2*nt,2, work, e[i].dxdn2[d],nt);
-    fd->pt[  i].hes[6*d+4] = work[2*nt  ];
-    fd->pt[4+i].hes[6*d+4] = work[2*nt+1];
-  }
-}
-
-static const struct findpts_el_gpt_3 *get_pt(
-  struct findpts_el_data_3 *fd, unsigned pi)
-{
-  if((fd->side_init&0x40u)==0)
-    compute_pt_data(fd), fd->side_init |= 0x40u;
-  return &fd->pt[pi];
-}
-
-/* check reduction in objective against prediction, and adjust
-   trust region radius (p->tr) accordingly;
-   may reject the prior step, returning 1; otherwise returns 0
-   sets out->dist2, out->index, out->x, out->oldr in any event,
-   leaving out->r, out->dr, out->flags to be set when returning 0 */
-static int reject_prior_step_q(struct findpts_el_pt_3 *const out,
-                               const double resid[3],
-                               const struct findpts_el_pt_3 *const p,
-                               const double tol)
-{
-  const double old_dist2 = p->dist2;
-  const double dist2 = resid[0]*resid[0]+resid[1]*resid[1]+resid[2]*resid[2];
-  const double decr = old_dist2-dist2;
-  const double pred = p->dist2p;
-  out->x[0]=p->x[0],out->x[1]=p->x[1],out->x[2]=p->x[2];
-  out->oldr[0]=p->r[0],out->oldr[1]=p->r[1],out->oldr[2]=p->r[2];
-  out->index=p->index;
-  out->dist2=dist2;
-#ifdef DIAGNOSTICS_2
-  printf("Checking prior step:\n"
-         "       old r = (%.17g,%.17g,%.17g), old flags = %x\n"
-         "   old_dist2 = %.17g\n"
-         "           r = (%.17g,%.17g,%.17g),     flags = %x\n"
-         "       dist2 = %.17g\n"
-         "  difference = %.17g\n"
-         "   predicted = %.17g\n"
-         "         rho = %.17g\n",
-         p->oldr[0],p->oldr[1],p->oldr[2],(p->flags>>7)&FLAG_MASK,old_dist2,
-         p->r[0],p->r[1],p->r[2],p->flags&FLAG_MASK,dist2,
-         decr, pred, decr/pred);
-#endif
-  if(decr>= 0.01 * pred) {
-    if(decr>= 0.9 * pred) {
-      out->tr = p->tr*2;
-#ifdef DIAGNOSTICS_2
-      printf("  very good iteration; tr -> %g\n", out->tr);
-#endif
-    } else {
-#ifdef DIAGNOSTICS_2
-      printf("  good iteration; tr = %g\n", p->tr);
-#endif
-      out->tr = p->tr;
-    }
-    return 0;
-  } else {
-    /* reject step; note: the point will pass through this routine
-       again, and we set things up here so it gets classed as a
-       "very good iteration" --- this doubles the trust radius,
-       which is why we divide by 4 below */
-    double v0 = fabs(p->r[0]-p->oldr[0]),
-           v1 = fabs(p->r[1]-p->oldr[1]),
-           v2 = fabs(p->r[2]-p->oldr[2]);
-    out->tr = (v1>v2?(v0>v1?v0:v1):(v0>v2?v0:v2))/4;
-#ifdef DIAGNOSTICS_2
-    printf("  bad iteration; tr -> %g\n", out->tr);
-#endif
-    out->dist2=old_dist2;
-    out->r[0]=p->oldr[0],out->r[1]=p->oldr[1],out->r[2]=p->oldr[2];
-    out->flags=p->flags>>7;
-    out->dist2p=-DBL_MAX;
-    if(pred < dist2*tol) out->flags|=CONVERGED_FLAG;
-    return 1;
-  }
-}
-
-/* minimize ||resid - jac * dr||_2, with |dr| <= tr, |r0+dr|<=1
-   (exact solution of trust region problem) */
-static void newton_vol(struct findpts_el_pt_3 *const out,
-                       const double jac[9], const double resid[3],
-                       const struct findpts_el_pt_3 *const p, const double tol)
-{
-  const double tr = p->tr;
-  double bnd[6] = { -1,1, -1,1, -1,1 };
-  double r0[3];
-  double dr[3], fac;
-  unsigned d, mask, flags;
-  r0[0]=p->r[0],r0[1]=p->r[1],r0[2]=p->r[2];
-#ifdef DIAGNOSTICS_1
-  printf("newton_vol:\n");
-  printf("  resid = (%g,%g,%g); r^T r / 2 = %g\n",resid[0],resid[1],resid[2],
-         (resid[0]*resid[0]+resid[1]*resid[1]+resid[2]*resid[2])/2);
-  printf("  jac = %g\t%g\t%g\n"
-         "        %g\t%g\t%g\n"
-         "        %g\t%g\t%g\n",
-         jac[0],jac[1],jac[2],jac[3],jac[4],jac[5],jac[6],jac[7],jac[8]);
-  printf("  r = (%.15g,%.15g,%.15g)\n",r0[0],r0[1],r0[2]);
-#endif
-
-  mask = 0x3fu;
-  for(d=0;d<3;++d) {
-    if(r0[d]-tr>-1) bnd[2*d  ]=r0[d]-tr, mask^=1u<<(2*d);
-    if(r0[d]+tr< 1) bnd[2*d+1]=r0[d]+tr, mask^=2u<<(2*d);
-  }
-
-  lin_solve_3(dr, jac,resid);
-
-#ifdef DIAGNOSTICS_1
-  printf("  min at r = (%.17g,%.17g,%.17g)\n",
-         r0[0]+dr[0],r0[1]+dr[1],r0[2]+dr[2]);
-#endif
-
-  fac = 1, flags = 0;
-  for(d=0;d<3;++d) {
-    double nr = r0[d]+dr[d];
-    if((nr-bnd[2*d])*(bnd[2*d+1]-nr)>=0) continue;
-    if(nr<bnd[2*d]) {
-      double f = (bnd[2*d  ]-r0[d])/dr[d];
-      if(f<fac) fac=f, flags = 1u<<(2*d);
-    } else {
-      double f = (bnd[2*d+1]-r0[d])/dr[d];
-      if(f<fac) fac=f, flags = 2u<<(2*d);
-    }
-  }
-
-#ifdef DIAGNOSTICS_1
-  printf("  flags = %x, fac = %.15g\n",flags,fac);
-#endif
-
-  if(flags==0) goto newton_vol_fin;
-
-  for(d=0;d<3;++d) dr[d]*=fac;
-
-  newton_vol_face: {
-    const unsigned fi = face_index(flags);
-    const unsigned dn = fi>>1, d1 = plus_1_mod_3(dn), d2 = plus_2_mod_3(dn);
-    double drc[2], ffac=1;
-    unsigned new_flags=0;
-    double res[3], y[2], JtJ[3];
-    res[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]),
-    res[1] = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]),
-    res[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]);
-    /* y = J_u^T res */
-    y[0] = jac[d1]*res[0]+jac[3+d1]*res[1]+jac[6+d1]*res[2],
-    y[1] = jac[d2]*res[0]+jac[3+d2]*res[1]+jac[6+d2]*res[2];
-    /* JtJ = J_u^T J_u */
-    JtJ[0] = jac[  d1]*jac[  d1]
-            +jac[3+d1]*jac[3+d1]
-            +jac[6+d1]*jac[6+d1],
-    JtJ[1] = jac[  d1]*jac[  d2]
-            +jac[3+d1]*jac[3+d2]
-            +jac[6+d1]*jac[6+d2],
-    JtJ[2] = jac[  d2]*jac[  d2]
-            +jac[3+d2]*jac[3+d2]
-            +jac[6+d2]*jac[6+d2];
-    lin_solve_sym_2(drc, JtJ,y);
-#ifdef DIAGNOSTICS_1
-    printf("  face %u, dn=%u, (d1,d2)=(%u,%u)\n",fi,dn,d1,d2);
-    printf("    r=(%.17g,%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1],r0[2]+dr[2]);
-    printf("    resid = (%g,%g,%g); r^T r / 2 = %g\n",res[0],res[1],res[2],
-           (res[0]*res[0]+res[1]*res[1]+res[2]*res[2])/2);
-    printf("    min at (%.17g,%.17g)\n",
-           r0[d1]+dr[d1]+drc[0],r0[d2]+dr[d2]+drc[1]);
-#endif
-    #define CHECK_CONSTRAINT(drcd,d3) do { \
-      const double rz = r0[d3]+dr[d3], lb=bnd[2*d3],ub=bnd[2*d3+1]; \
-      const double delta=drcd, nr = r0[d3]+(dr[d3]+delta); \
-      if((nr-lb)*(ub-nr)<0) { \
-        if(nr<lb) { \
-          double f = (lb-rz)/delta; \
-          if(f<ffac) ffac=f, new_flags = 1u<<(2*d3); \
-        } else { \
-          double f = (ub-rz)/delta; \
-          if(f<ffac) ffac=f, new_flags = 2u<<(2*d3); \
-        } \
-      } \
-    } while(0)
-    CHECK_CONSTRAINT(drc[0],d1); CHECK_CONSTRAINT(drc[1],d2);
-#ifdef DIAGNOSTICS_1
-    printf("    new_flags = %x, ffac = %.17g\n",new_flags,ffac);
-#endif
-    dr[d1] += ffac*drc[0], dr[d2] += ffac*drc[1];
-    if(new_flags==0) goto newton_vol_fin;
-    flags |= new_flags;
-  }
-
-  newton_vol_edge: {
-    const unsigned ei = edge_index(flags);
-    const unsigned de = ei>>2;
-    double ffac = 1;
-    unsigned new_flags = 0;
-    double res[3],y,JtJ,drc;
-    res[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]),
-    res[1] = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]),
-    res[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]);
-    /* y = J_u^T res */
-    y = jac[de]*res[0]+jac[3+de]*res[1]+jac[6+de]*res[2];
-    /* JtJ = J_u^T J_u */
-    JtJ = jac[  de]*jac[  de]
-         +jac[3+de]*jac[3+de]
-         +jac[6+de]*jac[6+de];
-    drc = y/JtJ;
-#ifdef DIAGNOSTICS_1
-    printf("  edge %u, de=%u\n",ei,de);
-    printf("    r=(%.17g,%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1],r0[2]+dr[2]);
-    printf("    resid = (%g,%g,%g); r^T r / 2 = %g\n",res[0],res[1],res[2],
-           (res[0]*res[0]+res[1]*res[1]+res[2]*res[2])/2);
-    printf("    min at %.17g\n", r0[de]+dr[de]+drc);
-#endif
-    CHECK_CONSTRAINT(drc,de);
-    #undef CHECK_CONSTRAINT
-#ifdef DIAGNOSTICS_1
-    printf("    new_flags = %x, ffac = %.17g\n",new_flags,ffac);
-#endif
-    dr[de] += ffac*drc;
-    flags |= new_flags;
-    goto newton_vol_relax;
-  }
-
-  /* check and possibly relax constraints */
-  newton_vol_relax: {
-    const unsigned old_flags = flags;
-    double res[3], y[3];
-    /* res := res_0 - J dr */
-    res[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]),
-    res[1] = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]),
-    res[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]);
-    /* y := J^T res */
-    y[0] = jac[0]*res[0]+jac[3]*res[1]+jac[6]*res[2],
-    y[1] = jac[1]*res[0]+jac[4]*res[1]+jac[7]*res[2],
-    y[2] = jac[2]*res[0]+jac[5]*res[1]+jac[8]*res[2];
-    #define SETDR(d) do { \
-      unsigned f = flags>>(2*d) & 3u; \
-      if(f) dr[d] = bnd[2*d+(f-1)] - r0[d]; \
-    } while(0)
-    SETDR(0); SETDR(1); SETDR(2);
-    #undef SETDR
-    for(d=0;d<3;++d) {
-      unsigned c = flags>>(2*d) & 3u;
-      if(c==0) continue;
-      else if(dr[d]*y[d]<0) flags &= ~(3u<<(2*d));
-#ifdef DIAGNOSTICS_1
-      if( (c==1&&dr[d]>0) || (c==2&&dr[d]<0) )
-        printf("FAIL! c=%u, dr[d]=%g\n",c,dr[d]);
-#endif
-    }
-#ifdef DIAGNOSTICS_1
-    printf("  checking constraints (%x)\n",old_flags);
-    printf("    r=(%.17g,%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1],r0[2]+dr[2]);
-    printf("    resid = (%g,%g,%g); r^T r / 2 = %g\n",res[0],res[1],res[2],
-           (res[0]*res[0]+res[1]*res[1]+res[2]*res[2])/2);
-    printf("    relaxed %x -> %x\n",old_flags,flags);
-#endif
-    if(flags==old_flags) goto newton_vol_fin;
-    switch(num_constrained(flags)) {
-      case 1: goto newton_vol_face;
-      case 2: goto newton_vol_edge;
-    }
-  }
-
-newton_vol_fin:
-#ifdef DIAGNOSTICS_1
-  {
-    const double res[3]={ resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]),
-                          resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]),
-                          resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]) };
-    printf("  r=(%.17g,%.17g,%.17g)\n", r0[0]+dr[0],r0[1]+dr[1],r0[2]+dr[2]);
-    printf("  resid = (%g,%g,%g); r^T r / 2 = %g\n",res[0],res[1],res[2],
-           (res[0]*res[0]+res[1]*res[1]+res[2]*res[2])/2);
-  }
-#endif
-  flags &= mask;
-  if(fabs(dr[0])+fabs(dr[1])+fabs(dr[2]) < tol) flags |= CONVERGED_FLAG;
-  {
-    const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]),
-                 res1 = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]),
-                 res2 = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]);
-    out->dist2p=resid[0]*resid[0]+resid[1]*resid[1]+resid[2]*resid[2]
-                -(res0*res0+res1*res1+res2*res2);
-  }
-  #define SETR(d) do { \
-    unsigned f = flags>>(2*d) & 3u; \
-    out->r[d] = f==0 ? r0[d]+dr[d] : ( f==1 ? -1 : 1 ); \
-  } while(0)
-  SETR(0); SETR(1); SETR(2);
-  #undef SETR
-  out->flags = flags | (p->flags<<7);
-}
-
-static void newton_face(struct findpts_el_pt_3 *const out,
-                        const double jac[9], const double rhes[3],
-                        const double resid[3],
-                        const unsigned d1, const unsigned d2, const unsigned dn,
-                        const unsigned flags,
-                        const struct findpts_el_pt_3 *const p, const double tol)
-{
-  const double tr = p->tr;
-  double bnd[4];
-  double r[2], dr[2]={0,0};
-  unsigned mask, new_flags;
-  double v, tv; unsigned i;
-  double A[3], y[2], r0[2];
-  /* A = J^T J - resid_d H_d */
-  A[0] = jac[  d1]*jac[  d1]
-        +jac[3+d1]*jac[3+d1]
-        +jac[6+d1]*jac[6+d1] - rhes[0],
-  A[1] = jac[  d1]*jac[  d2]
-        +jac[3+d1]*jac[3+d2]
-        +jac[6+d1]*jac[6+d2] - rhes[1],
-  A[2] = jac[  d2]*jac[  d2]
-        +jac[3+d2]*jac[3+d2]
-        +jac[6+d2]*jac[6+d2] - rhes[2];
-  /* y = J^T r */
-  y[0] = jac[  d1]*resid[0]
-        +jac[3+d1]*resid[1]
-        +jac[6+d1]*resid[2],
-  y[1] = jac[  d2]*resid[0]
-        +jac[3+d2]*resid[1]
-        +jac[6+d2]*resid[2];
-  r0[0] = p->r[d1], r0[1] = p->r[d2];
-
-#ifdef DIAGNOSTICS_1
-  printf("newton_face, dn=%u, (d1,d2)=%u,%u:\n", dn,d1,d2);
-  printf("  J^T r = (%g,%g)\n", y[0],y[1]);
-  printf("  A = %g\t%g\n"
-         "      %g\t%g\n", A[0],A[1],A[1],A[2]);
-  printf("  r = (%.15g,%.15g)\n", r0[0],r0[1]);
-#endif
-
-  new_flags=flags;
-  mask=0x3fu;
-  if(r0[0]-tr>-1) bnd[0]=-tr, mask^=1u;    else bnd[0]=-1-r0[0];
-  if(r0[0]+tr< 1) bnd[1]= tr, mask^=2u;    else bnd[1]= 1-r0[0];
-  if(r0[1]-tr>-1) bnd[2]=-tr, mask^=1u<<2; else bnd[2]=-1-r0[1];
-  if(r0[1]+tr< 1) bnd[3]= tr, mask^=2u<<2; else bnd[3]= 1-r0[1];
-
-#ifdef DIAGNOSTICS_1
-  printf("  bounds = ([%.15g,%.15g],[%.15g,%.15g])\n",
-         r0[0]+bnd[0],r0[0]+bnd[1],r0[1]+bnd[2],r0[1]+bnd[3]);
-#endif
-
-  if(A[0]+A[2]<=0 || A[0]*A[2]<=A[1]*A[1]) goto newton_face_constrained;
-  lin_solve_sym_2(dr, A,y);
-
-#ifdef DIAGNOSTICS_1
-  printf("  min at r = (%.15g,%.15g)\n", r0[0]+dr[0],r0[1]+dr[1]);
-#endif
-
-  #define EVAL(r,s) -(y[0]*r+y[1]*s)+(r*A[0]*r+(2*r*A[1]+s*A[2])*s)/2
-  if(   (dr[0]-bnd[0])*(bnd[1]-dr[0])>=0
-     && (dr[1]-bnd[2])*(bnd[3]-dr[1])>=0) {
-    r[0] = r0[0]+dr[0], r[1] = r0[1]+dr[1];
-    v = EVAL(dr[0],dr[1]);
-    goto newton_face_fin;
-  }
-newton_face_constrained:
-  v  = EVAL(bnd[0],bnd[2]); i=1u|(1u<<2);
-  tv = EVAL(bnd[1],bnd[2]); if(tv<v) v=tv, i=2u|(1u<<2);
-  tv = EVAL(bnd[0],bnd[3]); if(tv<v) v=tv, i=1u|(2u<<2);
-  tv = EVAL(bnd[1],bnd[3]); if(tv<v) v=tv, i=2u|(2u<<2);
-  if(A[0]>0) {
-    double drc;
-    drc = (y[0] - A[1]*bnd[2])/A[0];
-    if((drc-bnd[0])*(bnd[1]-drc)>=0 && (tv=EVAL(drc,bnd[2]))<v)
-      v=tv,i=1u<<2,dr[0]=drc;
-    drc = (y[0] - A[1]*bnd[3])/A[0];
-    if((drc-bnd[0])*(bnd[1]-drc)>=0 && (tv=EVAL(drc,bnd[3]))<v)
-      v=tv,i=2u<<2,dr[0]=drc;
-  }
-  if(A[2]>0) {
-    double drc;
-    drc = (y[1] - A[1]*bnd[0])/A[2];
-    if((drc-bnd[2])*(bnd[3]-drc)>=0 && (tv=EVAL(bnd[0],drc))<v)
-      v=tv,i=1u,dr[1]=drc;
-    drc = (y[1] - A[1]*bnd[1])/A[2];
-    if((drc-bnd[2])*(bnd[3]-drc)>=0 && (tv=EVAL(bnd[1],drc))<v)
-      v=tv,i=2u,dr[1]=drc;
-  }
-  #undef EVAL
-
-  #define SETR(d,d3) do { \
-    const unsigned f = i>>(2*d) & 3u; \
-    if(f==0) r[d]=r0[d]+dr[d]; \
-    else { \
-      if((f&(mask>>(2*d)))==0) r[d]=r0[d]+(f==1?-tr:tr); \
-      else                     r[d]=(f==1?-1:1), new_flags |= f<<(2*d3); \
-    } \
-  } while(0)
-  SETR(0,d1); SETR(1,d2);
-#ifdef DIAGNOSTICS_1
-  printf("  constrained min at r = (%.15g,%.15g)\n", r[0],r[1]);
-#endif
-newton_face_fin:
-  out->dist2p = -2*v;
-  dr[0]=r[0]-p->r[d1];
-  dr[1]=r[1]-p->r[d2];
-  if(fabs(dr[0])+fabs(dr[1]) < tol) new_flags |= CONVERGED_FLAG;
-  out->r[dn]=p->r[dn], out->r[d1]=r[0],out->r[d2]=r[1];
-  out->flags = new_flags | (p->flags<<7);
-}
-
-static void newton_edge(struct findpts_el_pt_3 *const out,
-  const double jac[9], const double rhes, const double resid[3],
-  const unsigned de, const unsigned dn1, const unsigned dn2,
-  unsigned flags,
-  const struct findpts_el_pt_3 *const p, const double tol)
-{
-  const double tr = p->tr;
-  /* A = J^T J - resid_d H_d */
-  const double A = jac[  de]*jac[  de]
-                  +jac[3+de]*jac[3+de]
-                  +jac[6+de]*jac[6+de] - rhes;
-  /* y = J^T r */
-  const double y = jac[  de]*resid[0]
-                  +jac[3+de]*resid[1]
-                  +jac[6+de]*resid[2];
-
-  const double oldr = p->r[de];
-  double dr,nr,tdr,tnr;
-  double v,tv; unsigned new_flags=0, tnew_flags=0;
-
-#ifdef DIAGNOSTICS_1
-  printf("Newton edge %u (dn1=%u,dn2=%u) flags=%x\n",de,dn1,dn2,flags);
-  printf("  A=%g, y=%g\n",A,y);
-  if(A<=0) printf("  A not positive\n");
-  printf("  r=(%g,%g,%g)\n",p->r[0],p->r[1],p->r[2]);
-#endif
-
-  #define EVAL(dr) (dr*A-2*y)*dr
-
-  /* if A is not SPD, quadratic model has no minimum */
-  if(A>0) {
-    dr = y/A, nr = oldr+dr;
-    if(fabs(dr)<tr && fabs(nr)<1) { v=EVAL(dr); goto newton_edge_fin; }
-  }
-
-  if(( nr=oldr-tr)>-1)  dr=-tr;
-  else                  nr=-1,  dr=-1-oldr,  new_flags = flags | 1u<<(2*de);
-  v =EVAL( dr);
-
-  if((tnr=oldr+tr)< 1) tdr=tr;
-  else                 tnr= 1, tdr= 1-oldr, tnew_flags = flags | 2u<<(2*de);
-  tv=EVAL(tdr);
-
-  if(tv<v) nr=tnr, dr=tdr, v=tv, new_flags=tnew_flags;
-
-newton_edge_fin:
-  /* check convergence */
-  if(fabs(dr) < tol) new_flags |= CONVERGED_FLAG;
-  out->r[de]=nr;
-  out->r[dn1]=p->r[dn1];
-  out->r[dn2]=p->r[dn2];
-  out->dist2p = -v;
-  out->flags = flags | new_flags | (p->flags<<7);
-#ifdef DIAGNOSTICS_1
-  printf("  new r = (%g,%g,%g)\n",out->r[0],out->r[1],out->r[2]);
-#endif
-}
-
-typedef void findpt_fun(
-  struct findpts_el_pt_3 *const out,
-  struct findpts_el_data_3 *const fd,
-  const struct findpts_el_pt_3 *const p, const unsigned pn, const double tol);
-
-/* work[(3+9+2*(nr+ns+nt+nrs))*pn + max(2*nr,ns) ] */
-static void findpt_vol(
-  struct findpts_el_pt_3 *const out,
-  struct findpts_el_data_3 *const fd,
-  const struct findpts_el_pt_3 *const p, const unsigned pn, const double tol)
-{
-  const unsigned nr=fd->n[0],ns=fd->n[1],nt=fd->n[2],
-                 nrs=nr*ns;
-  double *const resid = fd->work, *const jac = resid + 3*pn,
-         *const wtrs = jac+9*pn, *const wtt = wtrs+2*(nr+ns)*pn,
-         *const slice = wtt+2*nt*pn, *const temp = slice + 2*pn*nrs;
-  unsigned i; unsigned d;
-  /* evaluate x(r) and jacobian */
-  for(i=0;i<pn;++i)
-    fd->lag[0](wtrs+2*i*(nr+ns)     , fd->lag_data[0], nr, 1, p[i].r[0]);
-  for(i=0;i<pn;++i)
-    fd->lag[1](wtrs+2*i*(nr+ns)+2*nr, fd->lag_data[1], ns, 1, p[i].r[1]);
-  for(i=0;i<pn;++i)
-    fd->lag[2](wtt+2*i*nt           , fd->lag_data[2], nt, 1, p[i].r[2]);
-  for(d=0;d<3;++d) {
-    tensor_mxm(slice,nrs, fd->x[d],nt, wtt,2*pn);
-    for(i=0;i<pn;++i) {
-      const double *const wtrs_i = wtrs+2*i*(nr+ns),
-                   *const slice_i = slice+2*i*nrs;
-      double *const jac_i = jac+9*i+3*d;
-      resid[3*i+d] = p[i].x[d] - tensor_ig2(jac_i,
-        wtrs_i,nr, wtrs_i+2*nr,ns, slice_i, temp);
-      jac_i[2] = tensor_i2(wtrs_i,nr, wtrs_i+2*nr,ns, slice_i+nrs, temp);
-    }
-  }
-  /* perform Newton step */
-  for(i=0;i<pn;++i) {
-    if(reject_prior_step_q(out+i,resid+3*i,p+i,tol)) continue;
-    else newton_vol(out+i, jac+9*i, resid+3*i, p+i, tol);
-  }
-}
-
-/* work[(3+9+3+3*(n1+n2+n1))*pn ] */
-static void findpt_face(
-  struct findpts_el_pt_3 *const out,
-  struct findpts_el_data_3 *const fd,
-  const struct findpts_el_pt_3 *const p, const unsigned pn, const double tol)
-{
-  const unsigned pflag = p->flags & FLAG_MASK;
-  const unsigned fi = face_index(pflag);
-  const unsigned dn = fi>>1, d1 = plus_1_mod_3(dn), d2 = plus_2_mod_3(dn);
-  const unsigned n1 = fd->n[d1], n2 = fd->n[d2];
-  double *const resid=fd->work, *const jac=resid+3*pn, *const hes=jac+9*pn,
-         *const wt1 = hes+3*pn, *const wt2 = wt1+3*n1*pn,
-         *const slice = wt2+3*n2*pn;
-  const struct findpts_el_gface_3 *const face = get_face(fd,fi);
-  unsigned i; unsigned d;
-
-#ifdef DIAGNOSTICS_1
-  printf("Face %u\n",fi);
-  printf("  pflag = %u\n",pflag);
-  printf("  fi = %u\n",fi);
-  printf("  dn, d1, d2 = %u, %u, %u\n",dn,d1,d2);
-  printf("  n1, n2 = %u, %u \n", n1,n2);
-#endif
-
-  /* evaluate x(r), jacobian, hessian */
-  for(i=0;i<pn;++i)
-    fd->lag[d1](wt1+3*i*n1, fd->lag_data[d1], n1, 2, p[i].r[d1]);
-  for(i=0;i<pn;++i)
-    fd->lag[d2](wt2+3*i*n2, fd->lag_data[d2], n2, 2, p[i].r[d2]);
-  for(i=0;i<3*pn;++i) hes[i]=0;
-  for(d=0;d<3;++d) {
-    tensor_mxm(slice,n1, face->x[d],n2, wt2,3*pn);
-    for(i=0;i<pn;++i) {
-      const double *const wt1_i = wt1+3*i*n1, *const slice_i = slice+3*i*n1;
-      double v[9], r;
-      tensor_mtxm(v,3, wt1_i,n1, slice_i,3);
-      /* v[3*j + i] = d^i/dr1^i d^j/dr2^j x_d */
-      resid[3*i+d] = r = p[i].x[d] - v[0];
-      jac[9*i+3*d+d1] = v[1];
-      jac[9*i+3*d+d2] = v[3];
-      hes[3*i  ] += r * v[2];
-      hes[3*i+1] += r * v[4];
-      hes[3*i+2] += r * v[6];
-    }
-  }
-  for(i=1;i<pn;++i) memcpy(wt2+i*n2, wt2+3*i*n2, n2*sizeof(double));
-  for(d=0;d<3;++d) {
-    tensor_mxm(slice,n1, face->dxdn[d],n2, wt2,pn);
-    for(i=0;i<pn;++i)
-      jac[9*i+3*d+dn] = tensor_dot(wt1+3*i*n1, slice+i*n1, n1);
-  }
-  /* perform Newton step */
-  for(i=0;i<pn;++i) {
-    double *const resid_i=resid+3*i, *const jac_i=jac+9*i, *const hes_i=hes+3*i;
-    /* check prior step */
-    if(!reject_prior_step_q(out+i,resid_i,p+i,tol)) {
-      /* check constraint */
-      const double steep = resid_i[0] * jac_i[  dn]
-                          +resid_i[1] * jac_i[3+dn]
-                          +resid_i[2] * jac_i[6+dn];
-#ifdef DIAGNOSTICS_1
-      printf("jacobian = %g\t%g\t%g\n"
-             "           %g\t%g\t%g\n"
-             "           %g\t%g\t%g\n",jac_i[0],jac_i[1],jac_i[2],
-             jac_i[3],jac_i[4],jac_i[5],jac_i[6],jac_i[7],jac_i[8]);
-      printf("resid_i = (%g,%g,%g)\n", resid_i[0],resid_i[1],resid_i[2]);
-      printf("steep = %g (%s)\n", steep, steep * p[i].r[dn] < 0 ? "in" : "out");
-#endif
-      if(steep * p[i].r[dn] < 0) /* relax constraint */
-        newton_vol(out+i, jac_i, resid_i, p+i, tol);
-      else
-        newton_face(out+i, jac_i, hes_i, resid_i, d1,d2,dn,pflag, p+i, tol);
-    }
-  }
-}
-
-/* work[ 3*n ] */
-static void findpt_edge(
-  struct findpts_el_pt_3 *const out,
-  struct findpts_el_data_3 *const fd,
-  const struct findpts_el_pt_3 *const p, const unsigned pn, const double tol)
-{
-  const unsigned pflag = p->flags & FLAG_MASK;
-  const unsigned ei = edge_index(pflag);
-  const unsigned de = ei>>2, dn1 = plus_1_mod_3(de), dn2 = plus_2_mod_3(de);
-  const unsigned n = fd->n[de];
-  double *wt = fd->work;
-  const struct findpts_el_gedge_3 *edge = get_edge(fd,ei);
-  unsigned i; unsigned d;
-
-#ifdef DIAGNOSTICS_1
-  printf("Edge %u\n",ei);
-  printf("  pflag = %u\n",pflag);
-  printf("  ei = %u\n",ei);
-  printf("  de, dn1, dn2 = %u, %u, %u\n",de,dn1,dn2);
-  printf("  n = %u \n", n);
-#endif
-
-  for(i=0;i<pn;++i) {
-    double dxi[3], resid[3], jac[9];
-    double hes[5] = {0,0,0,0,0};
-    /* evaluate x(r), jacobian, hessian */
-    fd->lag[de](wt, fd->lag_data[de], n, 2, p[i].r[de]);
-    for(d=0;d<3;++d) {
-      double r;
-      tensor_mtxv(dxi,3, wt, edge->x[d],n);
-      resid[d] = r = p[i].x[d] - dxi[0];
-      jac[3*d+de] = dxi[1];
-      hes[0] += r * dxi[2];
-      tensor_mtxv(dxi,2, wt, edge->dxdn1[d],n);
-      jac[3*d+dn1] = dxi[0];
-      hes[1] += r * dxi[1];
-      tensor_mtxv(dxi,2, wt, edge->dxdn2[d],n);
-      jac[3*d+dn2] = dxi[0];
-      hes[2] += r * dxi[1];
-      hes[3] += r * tensor_dot(wt, edge->d2xdn1[d], n);
-      hes[4] += r * tensor_dot(wt, edge->d2xdn2[d], n);
-    }
-    /* check prior step */
-    if(reject_prior_step_q(out+i,resid,p+i,tol)) continue;
-    /* check constraint */
-    {
-      double steep[3], sr1, sr2;
-      steep[0] = jac[0]*resid[0] + jac[3]*resid[1] + jac[6]*resid[2],
-      steep[1] = jac[1]*resid[0] + jac[4]*resid[1] + jac[7]*resid[2],
-      steep[2] = jac[2]*resid[0] + jac[5]*resid[1] + jac[8]*resid[2];
-      sr1 = steep[dn1]*p[i].r[dn1],
-      sr2 = steep[dn2]*p[i].r[dn2];
-#ifdef DIAGNOSTICS_1
-    printf("jacobian = %g\t%g\t%g\n"
-           "           %g\t%g\t%g\n"
-           "           %g\t%g\t%g\n",jac[0],jac[1],jac[2],
-           jac[3],jac[4],jac[5],jac[6],jac[7],jac[8]);
-    printf("hessian = %g\t%g\t%g\n"
-           "            \t%g    \n"
-           "            \t  \t%g\n", hes[0],hes[1],hes[2],hes[3],hes[4]);
-    printf("resid = (%g,%g,%g)\n", resid[0],resid[1],resid[2]);
-    printf("steep1 = %g (%s)\n", steep[dn1], sr1 < 0 ? "in" : "out");
-    printf("steep2 = %g (%s)\n", steep[dn2], sr2 < 0 ? "in" : "out");
-#endif
-      if(sr1<0) {
-        if(sr2<0)
-          newton_vol(out+i, jac,resid, p+i, tol);
-        else {
-          double rh[3]; rh[0]=hes[0], rh[1]=hes[1], rh[2]=hes[3];
-          newton_face(out+i, jac,rh,resid, de,dn1,dn2,
-                      pflag & (3u<<(dn2*2)), p+i, tol);
-        }
-      } else if(sr2<0) {
-          double rh[3]; rh[0]=hes[4], rh[1]=hes[2], rh[2]=hes[0];
-          newton_face(out+i, jac,rh,resid, dn2,de,dn1,
-                      pflag & (3u<<(dn1*2)), p+i, tol);
-      } else
-        newton_edge(out+i, jac,hes[0],resid, de,dn1,dn2, pflag, p+i, tol);
-    }
-  }
-}
-
-static void findpt_pt(
-  struct findpts_el_pt_3 *const out,
-  struct findpts_el_data_3 *const fd,
-  const struct findpts_el_pt_3 *const p, const unsigned pn, const double tol)
-{
-  const unsigned pflag = p->flags & FLAG_MASK;
-  const unsigned pi = point_index(pflag);
-  const struct findpts_el_gpt_3 *gpt = get_pt(fd,pi);
-  const double *const x = gpt->x, *const jac = gpt->jac, *const hes = gpt->hes;
-  unsigned i;
-
-#ifdef DIAGNOSTICS_1
-  printf("Point %u\n",pi);
-  printf("  pflag = %u\n",pflag);
-  printf("  pi = %u\n",pi);
-#endif
-
-  for(i=0;i<pn;++i) {
-    unsigned d1,d2,dn, de,dn1,dn2, hi0,hi1,hi2;
-    double resid[3], steep[3], sr[3];
-    resid[0] = p[i].x[0]-x[0],
-    resid[1] = p[i].x[1]-x[1],
-    resid[2] = p[i].x[2]-x[2];
-    steep[0] = jac[0]*resid[0] + jac[3]*resid[1] + jac[6]*resid[2],
-    steep[1] = jac[1]*resid[0] + jac[4]*resid[1] + jac[7]*resid[2],
-    steep[2] = jac[2]*resid[0] + jac[5]*resid[1] + jac[8]*resid[2];
-    sr[0] = steep[0]*p[i].r[0],
-    sr[1] = steep[1]*p[i].r[1],
-    sr[2] = steep[2]*p[i].r[2];
-    /* check prior step */
-    if(reject_prior_step_q(out+i,resid,p+i,tol)) continue;
-    /* check constraints */
-    if(sr[0]<0) {
-      if(sr[1]<0) {
-        if(sr[2]<0) goto findpt_pt_vol;
-        else { d1=0,d2=1,dn=2, hi0=0,hi1=1,hi2=3; goto findpt_pt_face; }
-      }
-      else if(sr[2]<0) {d1=2,d2=0,dn=1, hi0=5,hi1=2,hi2=0; goto findpt_pt_face;}
-      else { de=0,dn1=1,dn2=2, hi0=0; goto findpt_pt_edge; }
-    }
-    else if(sr[1]<0) {
-      if(sr[2]<0) { d1=1,d2=2,dn=0, hi0=3,hi1=4,hi2=5; goto findpt_pt_face; }
-      else { de=1,dn1=2,dn2=0, hi0=3; goto findpt_pt_edge; }
-    }
-    else if(sr[2]<0) { de=2,dn1=0,dn2=1, hi0=5; goto findpt_pt_edge; }
-    out[i].r[0]=p[i].r[0],out[i].r[1]=p[i].r[1],out[i].r[2]=p[i].r[2];
-    out[i].dist2p=0;
-    out[i].flags = pflag | CONVERGED_FLAG;
-    continue;
-    findpt_pt_vol:
-      newton_vol(out+i, jac,resid, p+i, tol);
-      continue;
-    findpt_pt_face: {
-      double rh[3];
-      rh[0] = resid[0]*hes[hi0]+resid[1]*hes[6+hi0]+resid[2]*hes[12+hi0],
-      rh[1] = resid[0]*hes[hi1]+resid[1]*hes[6+hi1]+resid[2]*hes[12+hi1],
-      rh[2] = resid[0]*hes[hi2]+resid[1]*hes[6+hi2]+resid[2]*hes[12+hi2];
-      newton_face(out+i, jac,rh,resid, d1,d2,dn,
-                  pflag&(3u<<(2*dn)), p+i, tol);
-    } continue;
-    findpt_pt_edge: {
-      const double rh =
-        resid[0]*hes[hi0]+resid[1]*hes[6+hi0]+resid[2]*hes[12+hi0];
-      newton_edge(out+i, jac,rh,resid, de,dn1,dn2,
-                  pflag&~(3u<<(2*de)), p+i, tol);
-    } continue;
-  }
-}
-
-static void seed(struct findpts_el_data_3 *const fd,
-                 struct findpts_el_pt_3 *const pt, const unsigned npt)
-{
-  struct findpts_el_pt_3 *p, *const pe = pt+npt;
-  const unsigned nr=fd->n[0], ns=fd->n[1], nt=fd->n[2];
-  unsigned i,j,k, ii=0;
-  for(p=pt;p!=pe;++p) p->dist2=DBL_MAX;
-  for(k=0;k<nt;++k) {
-    const double zt=fd->z[2][k];
-    for(j=0;j<ns;++j) {
-      const double zs=fd->z[1][j];
-      for(i=0;i<nr;++i) {
-        const double zr=fd->z[0][i];
-        const double x=fd->x[0][ii], y=fd->x[1][ii], z=fd->x[2][ii];
-        ++ii;
-        for(p=pt;p!=pe;++p) {
-          const double dx=p->x[0]-x,dy=p->x[1]-y,dz=p->x[2]-z;
-          const double dist2 = dx*dx+dy*dy+dz*dz;
-          if(p->dist2<=dist2) continue;
-          p->dist2=dist2;
-          p->r[0]=zr, p->r[1]=zs, p->r[2]=zt;
-        }
-      }
-    }
-  }
-}
-
-void findpts_el_3(struct findpts_el_data_3 *const fd, const unsigned npt,
-                  const double tol)
-{
-  findpt_fun *const fun[4] =
-    { &findpt_vol, &findpt_face, &findpt_edge, &findpt_pt };
-  struct findpts_el_pt_3 *const pbuf = fd->p, *const pstart = fd->p + npt;
-  unsigned nconv = npt;
-  unsigned step = 0;
-  unsigned count[27] = { 0,0,0, 0,0,0, 0,0,0,
-                         0,0,0, 0,0,0, 0,0,0,
-                         0,0,0, 0,0,0, 0,0,0 } ;
-  count[0] = npt;
-  seed(fd,pbuf,npt);
-  { unsigned i;
-    for(i=0;i<npt;++i) {
-      pstart[i].x[0]=pbuf[i].x[0];
-      pstart[i].x[1]=pbuf[i].x[1];
-      pstart[i].x[2]=pbuf[i].x[2];
-      pstart[i].r[0]=pbuf[i].r[0];
-      pstart[i].r[1]=pbuf[i].r[1];
-      pstart[i].r[2]=pbuf[i].r[2];
-      pstart[i].index=i,pstart[i].flags=0;
-      pstart[i].dist2=DBL_MAX,pstart[i].dist2p=0,pstart[i].tr=1;
-    }
-  }
-  while(nconv && step++ < 50) {
-    /* advance each group of points */
-    struct findpts_el_pt_3 *p, *const pe=pstart+nconv, *pout; unsigned pn;
-
-#if DIAGNOSTICS_ITERATIONS>1
-    { unsigned i;
-      printf("findpts_el_3 Newton step (%u), %u unconverged:\n ", step,nconv);
-      for(i=0;i<27;++i) printf(" %u",count[i]);
-      printf("\n");
-    }
-#endif
-#ifdef DIAGNOSTICS_3
-    if(step==50) {
-      unsigned d, i, n=fd->n[0]*fd->n[1]*fd->n[2];
-      printf("geometry:\n{\n");
-      for(d=0;d<3;++d) {
-        printf(" {\n");
-        for(i=0;i<n;++i)
-          printf("  %.15g%s\n",fd->x[d][i],i==n-1?"":",");
-        printf(" }%s\n",d==3-1?"":",");
-      }
-      printf("}\n");
-    }
-#endif
-
-    for(p=pstart,pout=pbuf; p!=pe; p+=pn,pout+=pn) {
-      const unsigned pflags = p->flags & FLAG_MASK;
-      pn = count[pt_flags_to_bin_noC(pflags)];
-      fun[num_constrained(pflags)](pout, fd, p,pn, tol);
-    }
-    /* group points by contsraints */
-    {
-      unsigned offset[28] = { 0,0,0, 0,0,0, 0,0,0,
-                              0,0,0, 0,0,0, 0,0,0,
-                              0,0,0, 0,0,0, 0,0,0, 0 };
-      struct findpts_el_pt_3 *const ppe = pbuf+nconv;
-      for(pout=pbuf; pout!=ppe; ++pout)
-        ++offset[pt_flags_to_bin(pout->flags & FLAG_MASK)];
-      {
-        unsigned i; unsigned sum=0;
-        for(i=0;i<27;++i) {
-          unsigned ci=offset[i]; count[i]=ci, offset[i]=sum, sum+=ci;
-        }
-        nconv = offset[27] = sum; /* last bin is converged; forget it */
-      }
-      for(pout=pbuf; pout!=pe; ++pout)
-        pstart[offset[pt_flags_to_bin(pout->flags & FLAG_MASK)]++] = *pout;
-    }
-  }
-  { struct findpts_el_pt_3 *p, *const pe=pstart+npt;
-    for(p=pstart;p!=pe;++p)
-      pbuf[p->index]=*p, pbuf[p->index].flags&=FLAG_MASK;
-  }
-#if DIAGNOSTICS_ITERATIONS
-  printf("findpts_el_3 took %u steps\n ", step);
-#endif
-}
-
-void findpts_el_eval_3(
-        double *const out_base, const unsigned out_stride,
-  const double *const   r_base, const unsigned   r_stride, const unsigned pn,
-  const double *const in, struct findpts_el_data_3 *const fd)
-{
-  const unsigned nr=fd->n[0],ns=fd->n[1],nt=fd->n[2],
-                 nrs=nr*ns;
-  double *const wtrs = fd->work, *const wtt = wtrs+(nr+ns)*pn,
-         *const slice = wtt+nt*pn, *const temp = slice + pn*nrs;
-  unsigned i; const double *r; double *out;
-  for(i=0,r=r_base;i<pn;++i) {
-    fd->lag[0](wtrs+i*(nr+ns)   , fd->lag_data[0], nr, 0, r[0]);
-    fd->lag[1](wtrs+i*(nr+ns)+nr, fd->lag_data[1], ns, 0, r[1]);
-    fd->lag[2](wtt +i*nt        , fd->lag_data[2], nt, 0, r[2]);
-    r = (const double*)((const char*)r + r_stride);
-  }
-
-  tensor_mxm(slice,nrs, in,nt, wtt,pn);
-  for(i=0,out=out_base;i<pn;++i) {
-    const double *const wtrs_i = wtrs+i*(nr+ns), *const slice_i = slice+i*nrs;
-    *out = tensor_i2(wtrs_i,nr, wtrs_i+nr,ns, slice_i, temp);
-    out = (double*)((char*)out + out_stride);
-  }
-}
-
diff --git a/3rdParty/gslib/src/findpts_imp.h b/3rdParty/gslib/src/findpts_imp.h
deleted file mode 100644
index 85b936be0..000000000
--- a/3rdParty/gslib/src/findpts_imp.h
+++ /dev/null
@@ -1,470 +0,0 @@
-
-#define obbox           TOKEN_PASTE(obbox_,D)
-#define local_hash_data TOKEN_PASTE(findpts_local_hash_data_,D)
-#define hash_data       TOKEN_PASTE(findpts_hash_data_,D)
-#define hash_index      TOKEN_PASTE(hash_index_       ,D)
-#define hash_setfac     TOKEN_PASTE(hash_setfac_      ,D)
-#define hash_range      TOKEN_PASTE(hash_range_       ,D)
-#define hash_bb         TOKEN_PASTE(hash_bb_          ,D)
-#define set_local_mask  TOKEN_PASTE(set_local_mask_   ,D)
-#define fill_hash       TOKEN_PASTE(fill_hash_        ,D)
-#define table_from_hash TOKEN_PASTE(table_from_hash_  ,D)
-#define hash_build      TOKEN_PASTE(hash_build_       ,D)
-#define hash_free       TOKEN_PASTE(hash_free_        ,D)
-
-#define findpts_local_data  TOKEN_PASTE(findpts_local_data_,D)
-#define findpts_local_setup TOKEN_PASTE(PREFIXED_NAME(findpts_local_setup_),D)
-#define findpts_local_free  TOKEN_PASTE(PREFIXED_NAME(findpts_local_free_ ),D)
-#define findpts_local       TOKEN_PASTE(PREFIXED_NAME(findpts_local_      ),D)
-#define findpts_local_eval  TOKEN_PASTE(PREFIXED_NAME(findpts_local_eval_ ),D)
-#define findpts_data        TOKEN_PASTE(findpts_data_,D)
-#define src_pt              TOKEN_PASTE(src_pt_      ,D)
-#define out_pt              TOKEN_PASTE(out_pt_      ,D)
-#define eval_src_pt         TOKEN_PASTE(eval_src_pt_ ,D)
-#define eval_out_pt         TOKEN_PASTE(eval_out_pt_ ,D)
-#define setup_aux           TOKEN_PASTE(setup_aux_,D)
-#define findpts_setup       TOKEN_PASTE(PREFIXED_NAME(findpts_setup_),D)
-#define findpts_free        TOKEN_PASTE(PREFIXED_NAME(findpts_free_ ),D)
-#define findpts             TOKEN_PASTE(PREFIXED_NAME(findpts_      ),D)
-#define findpts_eval        TOKEN_PASTE(PREFIXED_NAME(findpts_eval_ ),D)
-
-struct hash_data {
-  ulong hash_n;
-  struct dbl_range bnd[D];
-  double fac[D];
-  uint *offset;
-};
-
-static ulong hash_index(const struct hash_data *p, const double x[D])
-{
-  const ulong n = p->hash_n;
-  return ( WHEN_3D( hash_index_aux(p->bnd[2].min,p->fac[2],n,x[2])  *n )
-                   +hash_index_aux(p->bnd[1].min,p->fac[1],n,x[1]) )*n
-                   +hash_index_aux(p->bnd[0].min,p->fac[0],n,x[0]);
-}
-
-static void hash_setfac(struct hash_data *p, const ulong n)
-{
-  unsigned d;
-  p->hash_n = n;
-  for(d=0;d<D;++d) p->fac[d] = n/(p->bnd[d].max-p->bnd[d].min);
-}
-
-static struct ulong_range hash_range(const struct hash_data *p, unsigned d,
-                                     const struct dbl_range r)
-{
-  struct ulong_range ir;
-  const slong i0 = lfloor( (r.min - p->bnd[d].min) * p->fac[d] );
-  const ulong i1 = lceil ( (r.max - p->bnd[d].min) * p->fac[d] );
-  ir.min = i0<0 ? 0 : i0;
-  ir.max = i1<p->hash_n ? i1 : p->hash_n;
-  if(ir.max==ir.min) ++ir.max;
-  return ir;
-}
-
-static void hash_bb(struct hash_data *p, const struct local_hash_data *lp,
-                    const struct comm *comm, uint hash_size)
-{
-  double x[D], buf[D], ghs;
-  unsigned d;
-  for(d=0;d<D;++d) x[d]=lp->bnd[d].min;
-  comm_allreduce(comm,gs_double,gs_min,x,D,buf);
-  for(d=0;d<D;++d) p->bnd[d].min=x[d];
-
-  for(d=0;d<D;++d) x[d]=lp->bnd[d].max;
-  comm_allreduce(comm,gs_double,gs_max,x,D,buf);
-  for(d=0;d<D;++d) p->bnd[d].max=x[d];
-
-  ghs = hash_size; comm_allreduce(comm,gs_double,gs_add,&ghs,1,buf);
-  hash_setfac(p,lceil(pow(ghs,1./D)));
-
-  #ifdef DIAGNOSTICS
-  if(comm->id==0) {
-    printf("global bounding box (%g^%u):\n",(double)p->hash_n,D);
-    for(d=0;d<D;++d) printf("  [%.17g, %.17g]\n",p->bnd[d].min,p->bnd[d].max);
-  }
-  #endif
-}
-
-static void set_local_mask(unsigned char *const local_mask,
-                           const ulong local_base[D], const uint local_n[D],
-                           const struct hash_data *const p,
-                           const struct obbox *const obb, const uint nel
-                          )
-{
-  uint el;
-  for(el=0;el<nel;++el) {
-    struct ulong_range ir[D]; unsigned d;
-    for(d=0;d<D;++d) ir[d]=hash_range(p,d,obb[el].x[d]);
-    #define FOR_LOOP() do { ulong i,j; WHEN_3D(ulong k;) \
-      WHEN_3D(for(k=ir[2].min;k<ir[2].max;++k)) \
-              for(j=ir[1].min;j<ir[1].max;++j) \
-              for(i=ir[0].min;i<ir[0].max;++i) \
-                set_bit(local_mask, (WHEN_3D((k-local_base[2]) *local_n[1]) \
-                                            +(j-local_base[1]))*local_n[0] \
-                                            +(i-local_base[0]) \
-                       ); \
-    } while(0)
-    FOR_LOOP();
-    #undef FOR_LOOP
-  }
-}
-
-static void fill_hash(struct array *const hash,
-                      const unsigned char *const local_mask,
-                      const ulong local_base[D], const uint local_n[D],
-                      const ulong hn, const uint np)
-{
-  struct proc_index *hp = hash->ptr;
-  #define FOR_LOOP() do { uint bit=0,i,j; WHEN_3D(uint k;) \
-    WHEN_3D(for(k=0;k<local_n[2];++k)) \
-            for(j=0;j<local_n[1];++j) \
-            for(i=0;i<local_n[0];++i) { ulong hi; \
-              if(get_bit(local_mask,bit++)==0) continue; \
-              hi = (WHEN_3D( (local_base[2]+k) *hn ) \
-                            +(local_base[1]+j))*hn \
-                            +(local_base[0]+i); \
-              hp->proc = hi%np, hp->index = hi/np; \
-              ++hp; \
-            } \
-  } while(0)
-  FOR_LOOP();
-  #undef FOR_LOOP
-}
-
-static void table_from_hash(struct hash_data *const p,
-                            struct array *const hash,
-                            const uint np, buffer *buf)
-{
-  const ulong hn = p->hash_n;
-  ulong hnd;
-  uint ncell, *offset, i, next_cell;
-  const struct proc_index *const hp = hash->ptr;
-  const uint n = hash->n;
-  hnd = hn*hn; WHEN_3D(hnd*=hn);
-  ncell = (hnd-1)/np+1;
-  p->offset = offset = tmalloc(uint,ncell+1+n);
-  sarray_sort(struct proc_index,hash->ptr,n, index,0, buf);
-  next_cell = 0;
-  for(i=0;i<n;++i) {
-    const uint cell = hp[i].index;
-    const uint off = ncell+1+i;
-    offset[off]=hp[i].proc;
-    while(next_cell<=cell ) offset[next_cell++]=off;
-  }
-  { const uint off = ncell+1+i;
-    while(next_cell<=ncell) offset[next_cell++]=off;
-  }
-}
-
-static void hash_build(struct hash_data *const p,
-                       const struct local_hash_data *const lp,
-                       const struct obbox *const obb, const uint nel,
-                       const uint hash_size,
-                       struct crystal *cr)
-{
-  ulong local_base[D]; uint local_n[D], local_ntot=1;
-  unsigned char *local_mask;
-  struct array hash; uint nc;
-  unsigned d;
-  hash_bb(p,lp,&cr->comm,hash_size);
-  for(d=0;d<D;++d) {
-    struct ulong_range rng=hash_range(p,d,lp->bnd[d]);
-    local_base[d]=rng.min;
-    local_n[d]=rng.max-rng.min;
-    local_ntot*=local_n[d];
-    #ifdef DIAGNOSTICS
-    if(cr->comm.id==0) {
-      printf("local_range %u: %lu to %lu\n",
-             d,(unsigned long)rng.min,(unsigned long)rng.max);
-    }
-    #endif
-  }
-  local_mask = tcalloc(unsigned char, (local_ntot+CHAR_BIT-1)/CHAR_BIT);
-  set_local_mask(local_mask,local_base,local_n,p,obb,nel);
-  nc=count_bits(local_mask,(local_ntot+CHAR_BIT-1)/CHAR_BIT);
-  #ifdef DIAGNOSTICS
-  printf("findpts_hash(%u): local cells : %u / %u\n",cr->comm.id,nc,local_ntot);
-  #endif
-  array_init(struct proc_index,&hash,nc), hash.n=nc;
-  fill_hash(&hash,local_mask,local_base,local_n,p->hash_n,cr->comm.np);
-  free(local_mask);
-  sarray_transfer(struct proc_index,&hash,proc,1,cr);
-  table_from_hash(p,&hash,cr->comm.np,&cr->data);
-  array_free(&hash);
-}
-
-static void hash_free(struct hash_data *p) { free(p->offset); }
-
-struct findpts_data {
-  struct crystal cr;
-  struct findpts_local_data local;
-  struct hash_data hash;
-};
-
-static void setup_aux(
-  struct findpts_data *const fd,
-  const double *const elx[D],
-  const unsigned n[D], const uint nel,
-  const unsigned m[D], const double bbox_tol,
-  const uint local_hash_size, const uint global_hash_size,
-  const unsigned npt_max, const double newt_tol)
-{
-  findpts_local_setup(&fd->local,elx,n,nel,m,bbox_tol,local_hash_size,
-                      npt_max, newt_tol);
-  hash_build(&fd->hash,&fd->local.hd,fd->local.obb,nel,
-             global_hash_size,&fd->cr);
-}
-
-struct findpts_data *findpts_setup(
-  const struct comm *const comm,
-  const double *const elx[D],
-  const unsigned n[D], const uint nel,
-  const unsigned m[D], const double bbox_tol,
-  const uint local_hash_size, const uint global_hash_size,
-  const unsigned npt_max, const double newt_tol)
-{
-  struct findpts_data *const fd = tmalloc(struct findpts_data, 1);
-  crystal_init(&fd->cr,comm);
-  setup_aux(fd,elx,n,nel,m,bbox_tol,
-            local_hash_size,global_hash_size,npt_max,newt_tol);
-  return fd;
-}
-
-void findpts_free(struct findpts_data *fd)
-{
-  hash_free(&fd->hash);
-  findpts_local_free(&fd->local);
-  crystal_free(&fd->cr);
-  free(fd);
-}
-
-struct src_pt { double x[D]; uint index, proc; };
-struct out_pt { double r[D], dist2; uint index, code, el, proc; };
-
-void findpts(      uint   *const  code_base   , const unsigned  code_stride   ,
-                   uint   *const  proc_base   , const unsigned  proc_stride   ,
-                   uint   *const    el_base   , const unsigned    el_stride   ,
-                   double *const     r_base   , const unsigned     r_stride   ,
-                   double *const dist2_base   , const unsigned dist2_stride   ,
-             const double *const     x_base[D], const unsigned     x_stride[D],
-             const uint npt, struct findpts_data *const fd)
-{
-  const uint np = fd->cr.comm.np, id=fd->cr.comm.id;
-  struct array hash_pt, src_pt, out_pt;
-  /* look locally first */
-  if(npt) findpts_local( code_base, code_stride,
-                           el_base,   el_stride,
-                            r_base,    r_stride,
-                        dist2_base,dist2_stride,
-                            x_base,    x_stride,
-                        npt,&fd->local,&fd->cr.data);
-  /* send unfound and border points to global hash cells */
-  {
-    uint index;
-    uint *code=code_base, *proc=proc_base;
-    const double *xp[D];
-    struct src_pt *pt;
-    unsigned d; for(d=0;d<D;++d) xp[d]=x_base[d];
-    array_init(struct src_pt, &hash_pt, npt), pt=hash_pt.ptr;
-    for(index=0;index<npt;++index) {
-      double x[D]; for(d=0;d<D;++d) x[d]=*xp[d];
-      *proc = id;
-      if(*code!=CODE_INTERNAL) {
-        const uint hi = hash_index(&fd->hash,x);
-        unsigned dd;
-        for(dd=0;dd<D;++dd) pt->x[dd]=x[dd];
-        pt->index=index;
-        pt->proc=hi%np;
-        ++pt;
-      }
-      for(d=0;d<D;++d)
-      xp[d] = (const double*)((const char*)xp[d]+   x_stride[d]);
-      code  =         (uint*)(      (char*)code +code_stride   );
-      proc  =         (uint*)(      (char*)proc +proc_stride   );
-    }
-    hash_pt.n = pt - (struct src_pt*)hash_pt.ptr;
-    sarray_transfer(struct src_pt,&hash_pt,proc,1,&fd->cr);
-  }
-  /* look up points in hash cells, route to possible procs */
-  {
-    const uint *const hash_offset = fd->hash.offset;
-    uint count=0, *proc, *proc_p;
-    const struct src_pt *p = hash_pt.ptr, *const pe = p+hash_pt.n;
-    struct src_pt *q;
-    for(;p!=pe;++p) {
-      const uint hi = hash_index(&fd->hash,p->x)/np;
-      const uint i = hash_offset[hi], ie = hash_offset[hi+1];
-      count += ie-i;
-    }
-    proc_p = proc = tmalloc(uint,count);
-    array_init(struct src_pt,&src_pt,count), q=src_pt.ptr;
-    for(p=hash_pt.ptr;p!=pe;++p) {
-      const uint hi = hash_index(&fd->hash,p->x)/np;
-      uint i = hash_offset[hi]; const uint ie = hash_offset[hi+1];
-      for(;i!=ie;++i) {
-        const uint pp = hash_offset[i];
-        if(pp==p->proc) continue; /* don't send back to source proc */
-        *proc_p++ = pp;
-        *q++ = *p;
-      }
-    }
-    array_free(&hash_pt);
-    src_pt.n = proc_p-proc;
-    #ifdef DIAGNOSTICS
-    printf("(proc %u) hashed; routing %u/%u\n",id,(unsigned)src_pt.n,count);
-    #endif
-    sarray_transfer_ext(struct src_pt,&src_pt,proc,sizeof(uint),&fd->cr);
-    free(proc);
-  }
-  /* look for other procs' points, send back */
-  {
-    uint n=src_pt.n;
-    const struct src_pt *spt;
-    struct out_pt *opt;
-    array_init(struct out_pt,&out_pt,n), out_pt.n=n;
-    spt=src_pt.ptr, opt=out_pt.ptr;
-    for(;n;--n,++spt,++opt) opt->index=spt->index,opt->proc=spt->proc;
-    spt=src_pt.ptr, opt=out_pt.ptr;
-    if(src_pt.n) {
-      const double *spt_x_base[D]; unsigned spt_x_stride[D];
-      unsigned d; for(d=0;d<D;++d) spt_x_base[d] = spt[0].x+d,
-                                   spt_x_stride[d] = sizeof(struct src_pt);
-      findpts_local(&opt[0].code ,sizeof(struct out_pt),
-                    &opt[0].el   ,sizeof(struct out_pt),
-                     opt[0].r    ,sizeof(struct out_pt),
-                    &opt[0].dist2,sizeof(struct out_pt),
-                     spt_x_base  ,spt_x_stride,
-                    src_pt.n,&fd->local,&fd->cr.data);
-    }
-    array_free(&src_pt);
-    /* group by code to eliminate unfound points */
-    sarray_sort(struct out_pt,opt,out_pt.n, code,0, &fd->cr.data);
-    n=out_pt.n; while(n && opt[n-1].code==CODE_NOT_FOUND) --n;
-    out_pt.n=n;
-    #ifdef DIAGNOSTICS
-    printf("(proc %u) sending back %u found points\n",id,(unsigned)out_pt.n);
-    #endif
-    sarray_transfer(struct out_pt,&out_pt,proc,1,&fd->cr);
-  }
-  /* merge remote results with user data */
-  {
-    #define  AT(T,var,i) (T*)((char*)var##_base+(i)*var##_stride)
-    uint n=out_pt.n;
-    struct out_pt *opt;
-    for(opt=out_pt.ptr;n;--n,++opt) {
-      const uint index = opt->index;
-      uint *code = AT(uint,code,index);
-      double *dist2 = AT(double,dist2,index);
-      if(*code==CODE_INTERNAL) continue;
-      if(*code==CODE_NOT_FOUND
-         || opt->code==CODE_INTERNAL
-         || opt->dist2<*dist2) {
-        double *r = AT(double,r,index);
-        uint  *el = AT(uint,el,index), *proc = AT(uint,proc,index);
-        unsigned d; for(d=0;d<D;++d) r[d]=opt->r[d];
-        *dist2 = opt->dist2;
-        *proc = opt->proc;
-        *el = opt->el;
-        *code = opt->code;
-      }
-    }
-    array_free(&out_pt);
-    #undef AT
-  }
-}
-
-struct eval_src_pt { double r[D]; uint index, proc, el; };
-struct eval_out_pt { double out; uint index, proc; };
-
-void findpts_eval(
-        double *const  out_base, const unsigned  out_stride,
-  const uint   *const code_base, const unsigned code_stride,
-  const uint   *const proc_base, const unsigned proc_stride,
-  const uint   *const   el_base, const unsigned   el_stride,
-  const double *const    r_base, const unsigned    r_stride,
-  const uint npt,
-  const double *const in, struct findpts_data *const fd)
-{
-  struct array src, outpt;
-  /* copy user data, weed out unfound points, send out */
-  {
-    uint index;
-    const uint *code=code_base, *proc=proc_base, *el=el_base;
-    const double *r=r_base;
-    struct eval_src_pt *pt;
-    array_init(struct eval_src_pt, &src, npt), pt=src.ptr;
-    for(index=0;index<npt;++index) {
-      if(*code!=CODE_NOT_FOUND) {
-        unsigned d;
-        for(d=0;d<D;++d) pt->r[d]=r[d];
-        pt->index=index;
-        pt->proc=*proc;
-        pt->el=*el;
-        ++pt;
-      }
-      r    = (const double*)((const char*)r   +   r_stride);
-      code = (const   uint*)((const char*)code+code_stride);
-      proc = (const   uint*)((const char*)proc+proc_stride);
-      el   = (const   uint*)((const char*)el  +  el_stride);
-    }
-    src.n = pt - (struct eval_src_pt*)src.ptr;
-    sarray_transfer(struct eval_src_pt,&src,proc,1,&fd->cr);
-  }
-  /* evaluate points, send back */
-  {
-    uint n=src.n;
-    const struct eval_src_pt *spt;
-    struct eval_out_pt *opt;
-    /* group points by element */
-    sarray_sort(struct eval_src_pt,src.ptr,n, el,0, &fd->cr.data);
-    array_init(struct eval_out_pt,&outpt,n), outpt.n=n;
-    spt=src.ptr, opt=outpt.ptr;
-    for(;n;--n,++spt,++opt) opt->index=spt->index,opt->proc=spt->proc;
-    spt=src.ptr, opt=outpt.ptr;
-    findpts_local_eval(&opt->out ,sizeof(struct eval_out_pt),
-                       &spt->el  ,sizeof(struct eval_src_pt),
-                        spt->r   ,sizeof(struct eval_src_pt),
-                       src.n, in,&fd->local);
-    array_free(&src);
-    sarray_transfer(struct eval_out_pt,&outpt,proc,1,&fd->cr);
-  }
-  /* copy results to user data */
-  {
-    #define  AT(T,var,i) (T*)((char*)var##_base+(i)*var##_stride)
-    uint n=outpt.n;
-    struct eval_out_pt *opt;
-    for(opt=outpt.ptr;n;--n,++opt) *AT(double,out,opt->index)=opt->out;
-    array_free(&outpt);
-    #undef AT
-  }
-}
-
-#undef findpts_eval
-#undef findpts
-#undef findpts_free
-#undef findpts_setup
-#undef setup_aux
-#undef eval_out_pt
-#undef eval_src_pt
-#undef out_pt
-#undef src_pt
-#undef findpts_data
-#undef findpts_local_eval
-#undef findpts_local
-#undef findpts_local_free
-#undef findpts_local_setup
-#undef findpts_local_data
-
-#undef hash_free
-#undef hash_build
-#undef table_from_hash
-#undef fill_hash
-#undef set_local_mask
-#undef hash_bb
-#undef hash_range
-#undef hash_setfac
-#undef hash_index
-#undef hash_data
-#undef local_hash_data
-#undef obbox
diff --git a/3rdParty/gslib/src/findpts_local.c b/3rdParty/gslib/src/findpts_local.c
deleted file mode 100644
index ea8096019..000000000
--- a/3rdParty/gslib/src/findpts_local.c
+++ /dev/null
@@ -1,52 +0,0 @@
-#include <stdio.h>
-
-#include <stddef.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-#include "c99.h"
-#include "types.h"
-#include "name.h"
-#include "fail.h"
-#include "mem.h"
-#include "obbox.h"
-#include "poly.h"
-#include "sort.h"
-#include "sarray_sort.h"
-#include "findpts_el.h"
-
-struct uint_range { uint min, max; };
-struct index_el { uint index, el; };
-
-static struct dbl_range dbl_range_merge(struct dbl_range a, struct dbl_range b)
-{
-  struct dbl_range m;
-  m.min = b.min<a.min?b.min:a.min,
-  m.max = a.max>b.max?a.max:b.max;
-  return m;
-}
-
-static sint ifloor(double x) { return floor(x); }
-static sint iceil (double x) { return ceil (x); }
-
-static uint hash_index_aux(double low, double fac, uint n, double x)
-{
-  const sint i = ifloor((x-low)*fac);
-  return i<0 ? 0 : (n-1<(uint)i ? n-1 : (uint)i);
-}
-
-#define CODE_INTERNAL 0
-#define CODE_BORDER 1
-#define CODE_NOT_FOUND 2
-
-#define D 2
-#define WHEN_3D(a)
-#include "findpts_local_imp.h"
-#undef WHEN_3D
-#undef D
-
-#define D 3
-#define WHEN_3D(a) a
-#include "findpts_local_imp.h"
-#undef WHEN_3D
-#undef D
diff --git a/3rdParty/gslib/src/findpts_local.h b/3rdParty/gslib/src/findpts_local.h
deleted file mode 100644
index 88c42d33a..000000000
--- a/3rdParty/gslib/src/findpts_local.h
+++ /dev/null
@@ -1,96 +0,0 @@
-#ifndef FINDPTS_LOCAL_H
-#define FINDPTS_LOCAL_H
-
-#if !defined(MEM_H) || !defined(FINDPTS_EL_H) || !defined(OBBOX_H)
-#warning "findpts_local.h" requires "mem.h", "findpts_el.h", "obbox.h"
-#endif
-
-#define findpts_local_setup_2   PREFIXED_NAME(findpts_local_setup_2)
-#define findpts_local_free_2    PREFIXED_NAME(findpts_local_free_2 )
-#define findpts_local_2         PREFIXED_NAME(findpts_local_2      )
-#define findpts_local_eval_2    PREFIXED_NAME(findpts_local_eval_2 )
-
-struct findpts_local_hash_data_2 {
-  uint hash_n;
-  struct dbl_range bnd[2];
-  double fac[2];
-  uint *offset;
-  uint max;
-};
-
-struct findpts_local_data_2 {
-  unsigned ntot;
-  const double *elx[2];
-  struct obbox_2 *obb;
-  struct findpts_local_hash_data_2 hd;
-  struct findpts_el_data_2 fed;
-  double tol;
-};
-
-void findpts_local_setup_2(struct findpts_local_data_2 *const fd,
-                           const double *const elx[2],
-                           const unsigned n[2], const uint nel,
-                           const unsigned m[2], const double bbox_tol,
-                           const uint max_hash_size,
-                           const unsigned npt_max, const double newt_tol);
-void findpts_local_free_2(struct findpts_local_data_2 *const fd);
-void findpts_local_2(
-        uint   *const  code_base   , const unsigned  code_stride   ,
-        uint   *const    el_base   , const unsigned    el_stride   ,
-        double *const     r_base   , const unsigned     r_stride   ,
-        double *const dist2_base   , const unsigned dist2_stride   ,
-  const double *const     x_base[2], const unsigned     x_stride[2],
-  const uint npt, struct findpts_local_data_2 *const fd,
-  buffer *buf);
-void findpts_local_eval_2(
-        double *const out_base, const unsigned out_stride,
-  const uint   *const  el_base, const unsigned  el_stride,
-  const double *const   r_base, const unsigned   r_stride,
-  const uint npt,
-  const double *const in, struct findpts_local_data_2 *const fd);
-
-#define findpts_local_setup_3   PREFIXED_NAME(findpts_local_setup_3)
-#define findpts_local_free_3    PREFIXED_NAME(findpts_local_free_3 )
-#define findpts_local_3         PREFIXED_NAME(findpts_local_3      )
-#define findpts_local_eval_3    PREFIXED_NAME(findpts_local_eval_3 )
-
-struct findpts_local_hash_data_3 {
-  uint hash_n;
-  struct dbl_range bnd[3];
-  double fac[3];
-  uint *offset;
-  uint max;
-};
-
-struct findpts_local_data_3 {
-  unsigned ntot;
-  const double *elx[3];
-  struct obbox_3 *obb;
-  struct findpts_local_hash_data_3 hd;
-  struct findpts_el_data_3 fed;
-  double tol;
-};
-
-void findpts_local_setup_3(struct findpts_local_data_3 *const fd,
-                           const double *const elx[3],
-                           const unsigned n[3], const uint nel,
-                           const unsigned m[3], const double bbox_tol,
-                           const uint max_hash_size,
-                           const unsigned npt_max, const double newt_tol);
-void findpts_local_free_3(struct findpts_local_data_3 *const fd);
-void findpts_local_3(
-        uint   *const  code_base   , const unsigned  code_stride   ,
-        uint   *const    el_base   , const unsigned    el_stride   ,
-        double *const     r_base   , const unsigned     r_stride   ,
-        double *const dist2_base   , const unsigned dist2_stride   ,
-  const double *const     x_base[3], const unsigned     x_stride[3],
-  const uint npt, struct findpts_local_data_3 *const fd,
-  buffer *buf);
-void findpts_local_eval_3(
-        double *const out_base, const unsigned out_stride,
-  const uint   *const  el_base, const unsigned  el_stride,
-  const double *const   r_base, const unsigned   r_stride,
-  const uint npt,
-  const double *const in, struct findpts_local_data_3 *const fd);
-
-#endif
diff --git a/3rdParty/gslib/src/findpts_local_imp.h b/3rdParty/gslib/src/findpts_local_imp.h
deleted file mode 100644
index e1e77427e..000000000
--- a/3rdParty/gslib/src/findpts_local_imp.h
+++ /dev/null
@@ -1,388 +0,0 @@
-
-#define obbox               TOKEN_PASTE(obbox_             ,D)
-#define obbox_calc          TOKEN_PASTE(PREFIXED_NAME(obbox_calc_),D)
-#define obbox_test          TOKEN_PASTE(obbox_test_        ,D)
-#define hash_data           TOKEN_PASTE(findpts_local_hash_data_,D)
-#define hash_index          TOKEN_PASTE(hash_index_        ,D)
-#define hash_setfac         TOKEN_PASTE(hash_setfac_       ,D)
-#define hash_range          TOKEN_PASTE(hash_range_        ,D)
-#define hash_count          TOKEN_PASTE(hash_count_        ,D)
-#define hash_opt_size       TOKEN_PASTE(hash_opt_size_     ,D)
-#define hash_bb             TOKEN_PASTE(hash_bb_           ,D)
-#define hash_build          TOKEN_PASTE(hash_build_        ,D)
-#define hash_free           TOKEN_PASTE(hash_free_         ,D)
-#define findpts_el_data     TOKEN_PASTE(findpts_el_data_   ,D)
-#define findpts_el_pt       TOKEN_PASTE(findpts_el_pt_     ,D)
-#define findpts_el_setup    TOKEN_PASTE(PREFIXED_NAME(findpts_el_setup_),D)
-#define findpts_el_free     TOKEN_PASTE(PREFIXED_NAME(findpts_el_free_ ),D)
-#define findpts_el          TOKEN_PASTE(PREFIXED_NAME(findpts_el_      ),D)
-#define findpts_el_eval     TOKEN_PASTE(PREFIXED_NAME(findpts_el_eval_ ),D)
-#define findpts_el_start    TOKEN_PASTE(findpts_el_start_  ,D)
-#define findpts_el_points   TOKEN_PASTE(findpts_el_points_ ,D)
-#define findpts_local_data  TOKEN_PASTE(findpts_local_data_,D)
-#define map_points_to_els   TOKEN_PASTE(map_points_to_els_ ,D)
-#define findpts_local_setup TOKEN_PASTE(PREFIXED_NAME(findpts_local_setup_),D)
-#define findpts_local_free  TOKEN_PASTE(PREFIXED_NAME(findpts_local_free_ ),D)
-#define findpts_local       TOKEN_PASTE(PREFIXED_NAME(findpts_local_      ),D)
-#define findpts_local_eval  TOKEN_PASTE(PREFIXED_NAME(findpts_local_eval_ ),D)
-
-/*--------------------------------------------------------------------------
-   Point to Possible Elements Hashing
-
-   Initializing the data:
-     uint nel;        // number of elements
-     uint max_size = nr*ns*nt*nel; // maximum size of hash table
-     struct obbox *obb = ...; // bounding boxes for elements
-
-     hash_data data;
-     hash_build(&data, obb, nel, max_size);
-
-   Using the data:
-     double x[3];   // point to find
-
-     uint index = hash_index_3(&data, x);
-     uint i, b = data.offset[index], e = data.offset[index+1];
-
-     // point may be in elements
-     //   data.offset[b], data.offset[b+1], ... , data.offset[e-1]
-     //
-     // list has maximum size data.max (e.g., e-b <= data.max)
-
-     for(i=b; i!=e; ++i) {
-       uint el = data.offset[i];
-       ...
-     }
-
-   When done:
-     hash_free(&data);
-
-  --------------------------------------------------------------------------*/
-
-struct hash_data {
-  uint hash_n;
-  struct dbl_range bnd[D];
-  double fac[D];
-  uint *offset;
-  uint max;
-};
-
-static uint hash_index(const struct hash_data *p, const double x[D])
-{
-  const uint n = p->hash_n;
-  return ( WHEN_3D( hash_index_aux(p->bnd[2].min,p->fac[2],n,x[2])  *n )
-                   +hash_index_aux(p->bnd[1].min,p->fac[1],n,x[1]) )*n
-                   +hash_index_aux(p->bnd[0].min,p->fac[0],n,x[0]);
-}
-
-static void hash_setfac(struct hash_data *p, const uint n)
-{
-  unsigned d;
-  p->hash_n = n;
-  for(d=0;d<D;++d) p->fac[d] = n/(p->bnd[d].max-p->bnd[d].min);
-}
-
-static struct uint_range hash_range(const struct hash_data *p, unsigned d,
-                                    const struct dbl_range r)
-{
-  struct uint_range ir;
-  const sint i0 = ifloor( (r.min - p->bnd[d].min) * p->fac[d] );
-  const uint i1 = iceil ( (r.max - p->bnd[d].min) * p->fac[d] );
-  ir.min = i0<0 ? 0 : i0;
-  ir.max = i1<p->hash_n ? i1 : p->hash_n;
-  if(ir.max==ir.min) ++ir.max;
-  return ir;
-}
-
-static uint hash_count(struct hash_data *p,
-                       const struct obbox *const obb, const uint nel,
-                       const uint n)
-{
-  uint i,count=0;
-  hash_setfac(p,n);
-  for(i=0;i<nel;++i) {
-    struct uint_range ir; uint ci; unsigned d;
-      ir=hash_range(p,0,obb[i].x[0]); ci  = ir.max-ir.min;
-    for(d=1;d<D;++d)
-      ir=hash_range(p,d,obb[i].x[d]), ci *= ir.max-ir.min;
-    count+=ci;
-  }
-  return count;
-}
-
-static uint hash_opt_size(struct hash_data *p,
-                          const struct obbox *const obb, const uint nel,
-                          const uint max_size)
-{
-  uint nl=1, nu=ceil(pow(max_size-nel,1.0/D));
-  uint size_low=2+nel;
-  while(nu-nl>1) {
-    uint nm = nl+(nu-nl)/2, nmd = nm*nm, size;
-    WHEN_3D(nmd *= nm);
-    size = nmd+1+hash_count(p,obb,nel,nm);
-    if(size<=max_size) nl=nm,size_low=size; else nu=nm;
-  }
-  hash_setfac(p,nl);
-  return size_low;
-}
-
-static void hash_bb(struct hash_data *p,
-                    const struct obbox *const obb, const uint nel)
-{
-  uint el; unsigned d;
-  struct dbl_range bnd[D];
-  if(nel) {
-    for(d=0;d<D;++d) bnd[d]=obb[0].x[d];
-    for(el=1;el<nel;++el)
-      for(d=0;d<D;++d)
-        bnd[d]=dbl_range_merge(bnd[d],obb[el].x[d]);
-    for(d=0;d<D;++d) p->bnd[d]=bnd[d];
-  } else {
-    for(d=0;d<D;++d) p->bnd[d].max=p->bnd[d].min=0;
-  }
-}
-
-static void hash_build(struct hash_data *p,
-                       const struct obbox *const obb, const uint nel,
-                       const uint max_size)
-{
-  uint i,el,size,hn,hnd,sum,max, *count;
-  hash_bb(p,obb,nel);
-  size = hash_opt_size(p,obb,nel,max_size);
-  p->offset = tmalloc(uint,size);
-  hn = p->hash_n;
-  hnd = hn*hn; WHEN_3D(hnd*=hn);
-  count = tcalloc(uint,hnd);
-  for(el=0;el<nel;++el) {
-    unsigned d; struct uint_range ir[D];
-    for(d=0;d<D;++d) ir[d]=hash_range(p,d,obb[el].x[d]);
-    #define FOR_LOOP() do { uint ii,j; WHEN_3D(uint k;) \
-      WHEN_3D(for(k=ir[2].min;k<ir[2].max;++k)) \
-              for(j=ir[1].min;j<ir[1].max;++j) \
-              for(ii=ir[0].min;ii<ir[0].max;++ii) \
-                ++count[(WHEN_3D(k*hn)+j)*hn+ii]; \
-    } while(0)
-    FOR_LOOP();
-    #undef FOR_LOOP
-  }
-  sum=hnd+1, max=count[0];
-  p->offset[0]=sum;
-  for(i=0;i<hnd;++i) {
-    max = count[i]>max?count[i]:max;
-    sum += count[i];
-    p->offset[i+1] = sum;
-  }
-  p->max = max;
-  for(el=0;el<nel;++el) {
-    unsigned d; struct uint_range ir[D];
-    for(d=0;d<D;++d) ir[d]=hash_range(p,d,obb[el].x[d]);
-    #define FOR_LOOP() do { uint ii,j; WHEN_3D(uint k;) \
-      WHEN_3D(for(k=ir[2].min;k<ir[2].max;++k)) \
-              for(j=ir[1].min;j<ir[1].max;++j) \
-              for(ii=ir[0].min;ii<ir[0].max;++ii) { \
-                uint index = (WHEN_3D(k*hn)+j)*hn+ii; \
-                p->offset[p->offset[index+1]-count[index]]=el; \
-                --count[index]; \
-              } \
-    } while(0)
-    FOR_LOOP();
-    #undef FOR_LOOP
-  }
-  free(count);
-}
-
-static void hash_free(struct hash_data *p) { free(p->offset); }
-
-struct findpts_local_data {
-  unsigned ntot;
-  const double *elx[D];
-  struct obbox *obb;
-  struct hash_data hd;
-  struct findpts_el_data fed;
-  double tol;
-};
-
-void findpts_local_setup(struct findpts_local_data *const fd,
-                         const double *const elx[D],
-                         const unsigned n[D], const uint nel,
-                         const unsigned m[D], const double bbox_tol,
-                         const uint max_hash_size,
-                         const unsigned npt_max, const double newt_tol)
-{
-  unsigned d;
-  unsigned ntot=n[0]; for(d=1;d<D;++d) ntot*=n[d];
-  fd->ntot = ntot;
-  for(d=0;d<D;++d) fd->elx[d]=elx[d];
-  fd->obb=tmalloc(struct obbox,nel);
-  obbox_calc(fd->obb,elx,n,nel,m,bbox_tol);
-  hash_build(&fd->hd,fd->obb,nel,max_hash_size);
-  findpts_el_setup(&fd->fed,n,npt_max);
-  fd->tol = newt_tol;
-}
-
-void findpts_local_free(struct findpts_local_data *const fd)
-{
-  findpts_el_free(&fd->fed);
-  hash_free(&fd->hd);
-  free(fd->obb);
-}
-
-static void map_points_to_els(
-  struct array *const map,
-        uint   *const  code_base   , const unsigned  code_stride   ,
-  const double *const     x_base[D], const unsigned     x_stride[D],
-  const uint npt, const struct findpts_local_data *const fd,
-  buffer *buf)
-{
-  uint index;
-  const double *xp[D]; uint *code=code_base;
-  unsigned d; for(d=0;d<D;++d) xp[d]=x_base[d];
-  array_init(struct index_el,map,npt+(npt>>2)+1);
-  for(index=0;index<npt;++index) {
-    double x[D]; for(d=0;d<D;++d) x[d]=*xp[d];
-    { const uint hi = hash_index(&fd->hd,x);
-      const uint       *elp = fd->hd.offset + fd->hd.offset[hi  ],
-                 *const ele = fd->hd.offset + fd->hd.offset[hi+1];
-      *code = CODE_NOT_FOUND;
-      for(; elp!=ele; ++elp) {
-        const uint el = *elp;
-        if(obbox_test(&fd->obb[el],x)>=0) {
-          struct index_el *const p =
-            array_reserve(struct index_el,map,map->n+1);
-          p[map->n].index = index;
-          p[map->n].el = el;
-          ++map->n;
-        }
-      }
-    }
-    for(d=0;d<D;++d)
-    xp[d] = (const double*)((const char*)xp[d]+   x_stride[d]);
-    code  =         (uint*)(      (char*)code +code_stride   );
-  }
-  /* group by element */
-  sarray_sort(struct index_el,map->ptr,map->n, el,0, buf);
-  /* add sentinel */
-  {
-    struct index_el *const p =
-      array_reserve(struct index_el,map,map->n+1);
-    p[map->n].el = -(uint)1;
-  }
-}
-
-#define   AT(T,var,i)   \
-        (T*)(      (char*)var##_base   +(i)*var##_stride   )
-#define  CAT(T,var,i) \
-  (const T*)((const char*)var##_base   +(i)*var##_stride   )
-#define CATD(T,var,i,d) \
-  (const T*)((const char*)var##_base[d]+(i)*var##_stride[d])
-
-void findpts_local(
-        uint   *const  code_base   , const unsigned  code_stride   ,
-        uint   *const    el_base   , const unsigned    el_stride   ,
-        double *const     r_base   , const unsigned     r_stride   ,
-        double *const dist2_base   , const unsigned dist2_stride   ,
-  const double *const     x_base[D], const unsigned     x_stride[D],
-  const uint npt, struct findpts_local_data *const fd,
-  buffer *buf)
-{
-  struct findpts_el_data *const fed = &fd->fed;
-  struct findpts_el_pt *const fpt = findpts_el_points(fed);
-  struct array map; /* point -> element map */
-  map_points_to_els(&map, code_base,code_stride, x_base,x_stride, npt, fd, buf);
-  {
-    const unsigned npt_max = fd->fed.npt_max;
-    const struct index_el *p, *const pe = (struct index_el *)map.ptr+map.n;
-    for(p=map.ptr;p!=pe;) {
-      const uint el = p->el, el_off=el*fd->ntot;
-      const double *elx[D];
-      unsigned d;
-      for(d=0;d<D;++d) elx[d]=fd->elx[d]+el_off;
-      findpts_el_start(fed,elx);
-      do {
-        const struct index_el *q;
-        unsigned i;
-        for(i=0,q=p;i<npt_max && q->el==el;++q) {
-          uint *code = AT(uint,code,q->index);
-          if(*code==CODE_INTERNAL) continue;
-          for(d=0;d<D;++d) fpt[i].x[d]=*CATD(double,x,q->index,d);
-          ++i;
-        }
-        findpts_el(fed,i,fd->tol);
-        for(i=0,q=p;i<npt_max && q->el==el;++q) {
-          const uint index=q->index;
-          uint *code = AT(uint,code,index);
-          double *dist2 = AT(double,dist2,index);
-          if(*code==CODE_INTERNAL) continue;
-          if(*code==CODE_NOT_FOUND
-             || fpt[i].flags==(1u<<(2*D)) /* converged, no constraints */
-             || fpt[i].dist2<*dist2) {
-            double *r = AT(double,r,index);
-            uint *eli = AT(uint,el,index);
-            *eli = el;
-            *code = fpt[i].flags==(1u<<(2*D)) ? CODE_INTERNAL : CODE_BORDER;
-            *dist2 = fpt[i].dist2;
-            for(d=0;d<D;++d) r[d]=fpt[i].r[d];
-          }
-          ++i;
-        }
-        p=q;
-      } while(p->el==el);
-    }
-  }
-  array_free(&map);
-}
-
-/* assumes points are already grouped by elements */
-void findpts_local_eval(
-        double *const out_base, const unsigned out_stride,
-  const uint   *const  el_base, const unsigned  el_stride,
-  const double *const   r_base, const unsigned   r_stride,
-  const uint npt,
-  const double *const in, struct findpts_local_data *const fd)
-{
-  struct findpts_el_data *const fed = &fd->fed;
-  const unsigned npt_max = fed->npt_max;
-  uint p;
-  for(p=0;p<npt;) {
-    const uint el = *CAT(uint,el,p);
-    const double *const in_el = in+el*fd->ntot;
-    do {
-      unsigned i; uint q;
-      for(i=0,q=p;i<npt_max && q<npt && *CAT(uint,el,q)==el;++q) ++i;
-      findpts_el_eval( AT(double,out,p),out_stride,
-                      CAT(double,  r,p),  r_stride, i,
-                      in_el,fed);
-      p=q;
-    } while(p<npt && *CAT(uint,el,p)==el);
-  }
-}
-
-#undef CATD
-#undef CAT
-#undef AT
-
-#undef findpts_local_eval
-#undef findpts_local
-#undef findpts_local_free
-#undef findpts_local_setup
-#undef map_points_to_els
-#undef findpts_local_data
-#undef findpts_el_points
-#undef findpts_el_start
-#undef findpts_el_eval
-#undef findpts_el
-#undef findpts_el_free
-#undef findpts_el_setup
-#undef findpts_el_data
-#undef hash_free
-#undef hash_build
-#undef hash_bb
-#undef hash_opt_size
-#undef hash_count
-#undef hash_range
-#undef hash_setfac
-#undef hash_index
-#undef hash_data
-#undef obbox_test
-#undef obbox_calc
-#undef obbox
-
diff --git a/3rdParty/gslib/src/gen_poly_imp.c b/3rdParty/gslib/src/gen_poly_imp.c
deleted file mode 100644
index 5fd7894a3..000000000
--- a/3rdParty/gslib/src/gen_poly_imp.c
+++ /dev/null
@@ -1,226 +0,0 @@
-#include <math.h>
-#include <stdio.h>
-#include <gmp.h>
-
-#define PREC_BITS 256
-#define DIGITS 50
-
-#define GLL_LAG_FIX_MAX 24
-
-#if 1
-#  define STATIC "static "
-#else
-#  define STATIC ""
-#endif
-
-
-#define PI 3.1415926535897932384626433832795028841971693993751058209749445923
-
-#define DECLARE_1VAR(a)        static int init=0; static mpf_t a; \
-                               if(!init) init=1, mpf_init(a)
-#define DECLARE_2VARS(a,b)     static int init=0; static mpf_t a,b; \
-                               if(!init) init=1, mpf_init(a), mpf_init(b)
-#define DECLARE_3VARS(a,b,c)   static int init=0; static mpf_t a,b,c; \
-                               if(!init) init=1, mpf_init(a), mpf_init(b), \
-                                                 mpf_init(c)
-#define DECLARE_4VARS(a,b,c,d) static int init=0; static mpf_t a,b,c,d; \
-                               if(!init) init=1, mpf_init(a), mpf_init(b), \
-                                                 mpf_init(c), mpf_init(d)
-                                                 
-static int is_small(const mpf_t x, const mpf_t y) {
-  DECLARE_2VARS(xa,ya);
-  mpf_abs(xa,x);
-  mpf_abs(ya,y);
-  mpf_div_2exp(ya,ya,PREC_BITS-mp_bits_per_limb);
-  return mpf_cmp(xa,ya) < 0;
-}
-
-typedef void fun_3term(mpf_t Pn, int n, const mpf_t x);
-
-#define DECLARE_THREE_TERM(name, i0_init, init_Ps, a_ip1,a_i,a_im1) \
-static void name(mpf_t Pn, int n, const mpf_t x) \
-{ \
-  int i, i0_init; \
-  DECLARE_4VARS(a,b,P_im1,P_i); \
-  init_Ps; \
-  for(i=i0+1; i<n; ++i) { \
-    mpf_mul(a, x,P_i); \
-    mpf_mul_ui(a, a,a_i); \
-    mpf_mul_ui(b, P_im1,a_im1); \
-    mpf_sub(a, a,b); \
-    mpf_swap(P_im1, P_i); \
-    mpf_div_ui(P_i, a,a_ip1); \
-  } \
-  mpf_set(Pn, n>i0?P_i:P_im1); \
-}
-
-DECLARE_THREE_TERM(legendre,    i0=0,(mpf_set_ui(P_im1,1),mpf_set   (P_i,x)),
-                   i+1, 2*i+1, i  )
-DECLARE_THREE_TERM(legendre_d1, i0=0,(mpf_set_ui(P_im1,0),mpf_set_ui(P_i,1)),
-                   i  , 2*i+1, i+1)
-DECLARE_THREE_TERM(legendre_d2, i0=1,(mpf_set_ui(P_im1,0),mpf_set_ui(P_i,3)),
-                   i-1, 2*i+1, i+2)
-
-static void newton(mpf_t x, double seed,
-                   fun_3term *fun, fun_3term *der, int n)
-{
-  DECLARE_3VARS(ox,f,df);
-  mpf_set_d(x, seed);
-  do {
-    mpf_set(ox, x);
-    fun(f, n,x), der(df, n,x), mpf_div(f, f,df), mpf_sub(x, x,f);
-  } while(!is_small(f,x));
-  fun( f, n,x), der(df, n,x), mpf_div(f, f,df), mpf_sub(x, x,f);
-}
-
-static void gauss_node(mpf_t z, int n, int i) {
-  if( (n&1) && i==n/2 ) mpf_set_ui(z,0);
-  else newton(z, cos( (2*n-2*i-1)*(PI/2)/n ), legendre,legendre_d1,n);
-}
-
-static void lobatto_node(mpf_t z, int n, int i) {
-  if( (n&1) && i==n/2 ) mpf_set_ui(z,0);
-  else if(i==0)   mpf_set_d(z,-(double)1);
-  else if(i==n-1) mpf_set_ui(z,1);
-  else newton(z, cos( (n-1-i)*PI/(n-1) ), legendre_d1,legendre_d2,n-1);
-}
-
-#define PRINT_LIST(i, i0,nline,n, printi,sep,sepline) \
-  do { \
-    int i; \
-    for(i=i0;i<n;++i) { \
-      printi; \
-      printf("%s",i==n-1?"":((i-i0)%nline==nline-1?sepline:sep)); \
-    } \
-  } while(0)
-
-static void print_gll_lag_fix(int n)
-{
-  int i;
-  DECLARE_1VAR(z);
-  if(n>3) {
-    printf("static const double gllz_%02d[%2d] = {\n  ",n,n/2-1);
-    for(i=1;i<=n/2-1;++i) {
-      lobatto_node(z, n,n-1-i);
-      if(i!=1) printf(",\n  ");
-      gmp_printf("%.*Fg",DIGITS,z);
-    }
-    puts("\n};\n");
-  }
-  printf(STATIC "void gll_lag_%02d(double *restrict p, double *restrict w,\n"
-           "                       unsigned n, int d, double xh)\n{\n",n);
-  printf("  const double x = xh*2;\n");
-  #define PRINT_D(i) do { \
-    printf("d%02d=x",i); \
-    if(2*i+1==n)    printf("              "); \
-    else if(i==0)   printf("+2            "); \
-    else if(i==n-1) printf("-2            "); \
-    else if(i<n/2)  printf("+2*gllz_%02d[%2d]",n,i-1); \
-    else            printf("-2*gllz_%02d[%2d]",n,n-2-i); \
-  } while(0)
-  printf("%s",                            "  const double ");
-  PRINT_LIST(i, 0,3,n, PRINT_D(i),",",",\n               ");
-  #undef PRINT_D
-  #define PRINT_U0(i) (i==0  ?printf("    1"):printf("u0_%02d",i))
-  #define PRINT_V0(i) (i==n-1?printf("    1"):printf("v0_%02d",i))
-  #define PRINT_U1(i) (i<=1  ?printf("    %d",i      ):printf("u1_%02d",i))
-  #define PRINT_V1(i) (i>=n-2?printf("    %d",n-1-(i)):printf("v1_%02d",i))
-  #define PRINT_U2(i) (i<=1  ?printf("    0"): \
-                      (i==2  ?printf("    2"):printf("u2_%02d",i)))
-  #define PRINT_V2(i) (i>=n-2?printf("    0"): \
-                      (i==n-3?printf("    2"):printf("v2_%02d",i)))
-  printf("%s",";\n  const double ");
-  PRINT_LIST(i, 1,3,n,
-    (PRINT_U0(i),putchar('='),PRINT_U0(i-1),printf("*d%02d",i-1)),
-    ",",",\n               ");
-  printf("%s",";\n  const double ");
-  PRINT_LIST(i, 1,3,n,
-    (PRINT_V0(n-1-i),putchar('='),printf("d%02d*",n-i),PRINT_V0(n-i)),
-    ",",",\n               ");
-  printf("%s",";\n  ");
-  PRINT_LIST(i, 0,3,n, 
-    (printf("p[%2d]=w[%2d]*",i,i),PRINT_U0(i),putchar('*'),
-     PRINT_V0(i)),"; ",";\n  ");
-  puts(";\n  if(d>0) {");
-  if(n>2) {
-    printf("%s","    const double ");
-    PRINT_LIST(i, 2,2,n,
-      (PRINT_U1(i),putchar('='),PRINT_U1(i-1),printf("*d%02d",i-1),
-       putchar('+'),PRINT_U0(i-1)),
-      ",",",\n                 ");
-    printf("%s",";\n    const double ");
-    PRINT_LIST(i, 2,2,n,
-      (PRINT_V1(n-1-i),putchar('='),printf("d%02d*",n-i),PRINT_V1(n-i),
-       putchar('+'),PRINT_V0(n-i)),
-      ",",",\n                 ");
-    puts(";");
-  }
-  for(i=0;i<n;++i) {
-    printf("    p[%d+%2d]=2*w[%2d]*(",n,i,i);
-    if(i==0)        printf("                  "),PRINT_V1(0);
-    else if(i==n-1) PRINT_U1(i),printf("                  ");
-    else PRINT_U1(i),putchar('*'),PRINT_V0(i),putchar('+'),
-         PRINT_U0(i),putchar('*'),PRINT_V1(i);
-    puts(");");
-  }
-  puts("    if(d>1) {");
-  if(n>3) {
-    printf("%s","      const double ");
-    PRINT_LIST(i, 3,2,n,
-      (PRINT_U2(i),putchar('='),PRINT_U2(i-1),printf("*d%02d",i-1),
-       printf("+2*"),PRINT_U1(i-1)),
-      ",",",\n                   ");
-    printf("%s",";\n      const double ");
-    PRINT_LIST(i, 3,2,n,
-      (PRINT_V2(n-1-i),putchar('='),printf("d%02d*",n-i),PRINT_V2(n-i),
-       printf("+2*"),PRINT_V1(n-i)),
-      ",",",\n                   ");
-    puts(";");
-  }  
-  if(n<3) for(i=0;i<n;++i) printf("      p[2*%d+%2d]=0;\n",n,i);
-  else for(i=0;i<n;++i) {
-      printf("      p[2*%d+%2d]=4*w[%2d]*(",n,i,i);
-      if(i>1)
-        PRINT_U2(i),putchar('*'),PRINT_V0(i);
-      else printf("           ");
-      if(i>0 && i<n-1)
-        printf("+2*"),PRINT_U1(i),putchar('*'),PRINT_V1(i);
-      else printf("              ");
-      if(i<n-2)
-        putchar('+'),PRINT_U0(i),putchar('*'),PRINT_V2(i);
-      else printf("            ");
-      puts(");");
-  }
-  #undef PRINT_U0
-  #undef PRINT_V0
-  #undef PRINT_U1
-  #undef PRINT_V1
-  #undef PRINT_U2
-  #undef PRINT_V2
-  puts("    }\n  }\n}");
-}
-
-
-int main()
-{
-  int n;
-  mpf_set_default_prec(PREC_BITS);
-  puts("/* generated by gen_poly_imp.c */\n");
-  printf("#define GLL_LAG_FIX_MAX %d\n\n",GLL_LAG_FIX_MAX);
-  /*puts("typedef void gll_lag_fun(double *p, int d, int n, double x);\n");*/
-  for(n=2;n<=GLL_LAG_FIX_MAX;++n)
-      print_gll_lag_fix(n), puts("");
-  printf(STATIC "const double *const gllz_table[%d] = {\n  ",
-    GLL_LAG_FIX_MAX-3);
-  PRINT_LIST(i, 4,8,(GLL_LAG_FIX_MAX+1),
-    printf("gllz_%02d",i), ", ",",\n  ");
-  puts("\n};");
-  puts("");
-  printf(STATIC "lagrange_fun *const gll_lag_table[%d] = {\n  ",
-    GLL_LAG_FIX_MAX-1);
-  PRINT_LIST(i, 2,6,(GLL_LAG_FIX_MAX+1),
-    printf("&gll_lag_%02d",i), ", ",",\n  ");
-  puts("\n};");
-  puts("");
-  return 0;
-}
diff --git a/3rdParty/gslib/src/gs.c b/3rdParty/gslib/src/gs.c
deleted file mode 100644
index 26fafdc33..000000000
--- a/3rdParty/gslib/src/gs.c
+++ /dev/null
@@ -1,1651 +0,0 @@
-#include <stdio.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-
-#define gs_op gs_op_t   /* fix conflict with fortran */
-
-#include "gs_defs.h"
-#include "gs_local.h"
-#include "comm.h"
-#include "mem.h"
-#include "sort.h"
-#include "crystal.h"
-#include "sarray_sort.h"
-#include "sarray_transfer.h"
-
-#define gs         PREFIXED_NAME(gs       )
-#define gs_vec     PREFIXED_NAME(gs_vec   )
-#define gs_many    PREFIXED_NAME(gs_many  )
-#define igs        PREFIXED_NAME(igs      )
-#define igs_vec    PREFIXED_NAME(igs_vec  )
-#define igs_many   PREFIXED_NAME(igs_many )
-#define gs_wait    PREFIXED_NAME(gs_wait  )
-#define gs_setup   PREFIXED_NAME(gs_setup )
-#define gs_free    PREFIXED_NAME(gs_free  )
-#define gs_unique  PREFIXED_NAME(gs_unique)
-#define gs_hf2c    PREFIXED_NAME(gs_hf2c  )
-
-GS_DEFINE_DOM_SIZES()
-
-typedef enum { mode_plain, mode_vec, mode_many,
-               mode_dry_run } gs_mode;
-
-static buffer static_buffer = null_buffer;
-
-static void gather_noop(
-  void *out, const void *in, const unsigned vn,
-  const uint *map, gs_dom dom, gs_op op)
-{}
-
-static void scatter_noop(
-  void *out, const void *in, const unsigned vn,
-  const uint *map, gs_dom dom)
-{}
-
-static void init_noop(
-  void *out, const unsigned vn,
-  const uint *map, gs_dom dom, gs_op op)
-{}
-
-/*------------------------------------------------------------------------------
-  Topology Discovery
-------------------------------------------------------------------------------*/
-
-struct gs_topology {
-  ulong total_shared; /* number of globally unique shared ids */
-  struct array nz; /* array of nonzero_id's, grouped by id,
-                      sorted by primary index, then flag, then index */
-  struct array sh; /* array of shared_id's, arbitrary ordering */
-  struct array pr; /* array of primary_shared_id's */
-};
-
-static void gs_topology_free(struct gs_topology *top)
-{
-  array_free(&top->pr);
-  array_free(&top->sh);
-  array_free(&top->nz);
-}
-
-/************** Local topology **************/
-
-/* nonzero_ids    (local part)
-
-   Creates an array of s_nonzeros, one per nonzero in user id array. The
-   output array is grouped by id. Within each group, non-flagged entries come
-   first; otherwise the entries within the group are sorted by the index into
-   the user id array. The first index in each group is the primary index, and
-   is stored along with each entry. The groups themselves are ordered in
-   increasing order of the primary index associated with the group (as opposed
-   to the user id). */
-
-struct nonzero_id {
-  ulong id; uint i, flag, primary;
-};
-
-static void nonzero_ids(struct array *nz,
-                        const slong *id, const uint n, buffer *buf)
-{
-  ulong last_id = ULONG_MAX;
-  uint i, primary = UINT_MAX;
-  struct nonzero_id *row, *end;
-  array_init(struct nonzero_id,nz,n), end=row=nz->ptr;
-  for(i=0;i<n;++i) {
-    slong id_i = id[i], abs_id = iabsl(id_i);
-    if(id_i==0) continue;
-    end->i = i;
-    end->id = abs_id;
-    end->flag = id_i!=abs_id;
-    ++end;
-  }
-  nz->n = end-row;
-  array_resize(struct nonzero_id,nz,nz->n);
-  sarray_sort_2(struct nonzero_id,nz->ptr,nz->n, id,1, flag,0, buf);
-  for(row=nz->ptr,end=row+nz->n;row!=end;++row) {
-    ulong this_id = row->id;
-    if(this_id!=last_id) primary = row->i;
-    row->primary = primary;
-    last_id = this_id;
-  }
-  sarray_sort(struct nonzero_id,nz->ptr,nz->n, primary,0, buf);
-}
-
-/************** Global topology **************/
-
-/* construct list of all unique id's on this proc */
-struct unique_id { ulong id; uint work_proc, src_if; };
-static void unique_ids(struct array *un, const struct array *nz, const uint np)
-{
-  struct unique_id *un_row;
-  const struct nonzero_id *nz_row, *nz_end;
-  array_init(struct unique_id,un,nz->n), un_row=un->ptr;
-  for(nz_row=nz->ptr,nz_end=nz_row+nz->n;nz_row!=nz_end;++nz_row) {
-    if(nz_row->i != nz_row->primary) continue;
-    un_row->id = nz_row->id;
-    un_row->work_proc = nz_row->id%np;
-    un_row->src_if = nz_row->flag ? ~nz_row->i : nz_row->i;
-    ++un_row;
-  }
-  un->n = un_row - (struct unique_id*)un->ptr;
-}
-
-/* shared_ids    (global part)
-
-   Creates an array of shared_id's from an array of nonzero_id's. Each entry
-   in the output identifies one id shared with one other processor p.
-   Note: two procs share an id only when at least one of them has it unflagged.
-   The primary index is i locally and ri remotely. Bit 1 of flags indicates
-   the local flag, bit 2 indicates the remote flag. The output has no
-   particular ordering.
-
-   Also creates an array of primary_shared_id's, one for each shared id.
-   This struct includes ord, a global rank of the id (arbitrary, but unique). */
-
-#define FLAGS_LOCAL  1
-#define FLAGS_REMOTE 2
-
-/* i  : local primary index
-   p  : remote proc
-   ri : remote primary index
-   bi : buffer index (set and used during pw setup) */
-struct shared_id {
-  ulong id; uint i, p, ri, bi; unsigned flags;
-};
-
-struct primary_shared_id {
-  ulong id, ord; uint i; unsigned flag;
-};
-
-
-
-struct shared_id_work { ulong id,ord; uint p1, p2, i1f, i2f; };
-static void shared_ids_aux(struct array *sh, struct array *pr, uint pr_n,
-                           struct array *wa, buffer *buf)
-{
-  const struct shared_id_work *w, *we;
-  struct shared_id *s;
-  struct primary_shared_id *p;
-  ulong last_id = ULONG_MAX;
-  /* translate work array to output arrays */
-  sarray_sort(struct shared_id_work,wa->ptr,wa->n, id,1, buf);
-  array_init(struct shared_id,sh,wa->n), sh->n=wa->n, s=sh->ptr;
-  array_init(struct primary_shared_id,pr,pr_n), p=pr->ptr;
-  for(w=wa->ptr,we=w+wa->n;w!=we;++w) {
-    uint i1f = w->i1f, i2f = w->i2f;
-    uint i1 = ~i1f<i1f?~i1f:i1f, i2 = ~i2f<i2f?~i2f:i2f;
-    s->id=w->id, s->i=i1, s->p=w->p2, s->ri=i2;
-    s->flags = ((i2f^i2)&FLAGS_REMOTE) | ((i1f^i1)&FLAGS_LOCAL);
-    ++s;
-    if(w->id!=last_id) {
-      last_id=w->id;
-      p->id=last_id, p->ord=w->ord, p->i=i1, p->flag=(i1f^i1)&FLAGS_LOCAL;
-      ++p;
-    }
-  }
-  pr->n = p-(struct primary_shared_id*)pr->ptr;
-  sarray_sort(struct primary_shared_id,pr->ptr,pr->n, i,0, buf);
-}
-
-static ulong shared_ids(struct array *sh, struct array *pr,
-                        const struct array *nz, struct crystal *cr)
-{
-  struct array un; struct unique_id *un_row, *un_end, *other;
-  ulong last_id = ULONG_MAX;
-  ulong ordinal[2], n_shared=0, scan_buf[2];
-  struct array wa; struct shared_id_work *w;
-  uint n_unique;
-  /* construct list of all unique id's on this proc */
-  unique_ids(&un,nz,cr->comm.np);
-  n_unique = un.n;
-  /* transfer list to work procs */
-  sarray_transfer(struct unique_id,&un, work_proc,1, cr);
-  /* group by id, put flagged entries after unflagged (within each group) */
-  sarray_sort_2(struct unique_id,un.ptr,un.n, id,1, src_if,0, &cr->data);
-  /* count shared id's */
-  for(un_row=un.ptr,un_end=un_row+un.n;un_row!=un_end;++un_row) {
-    ulong id = un_row->id;
-    if(~un_row->src_if<un_row->src_if) continue;
-    if(id==last_id) continue;
-    other=un_row+1;
-    if(other!=un_end&&other->id==id) last_id=id, ++n_shared;
-  }
-  comm_scan(ordinal, &cr->comm,gs_slong,gs_add, &n_shared,1, scan_buf);
-  /* there are ordinal[1] globally shared unique ids;
-           and ordinal[0] of those are seen by work procs of lower rank;
-     i.e., this work processor sees the range ordinal[0] + (0,n_shared-1) */
-  /* construct list of shared ids */
-  last_id = ULONG_MAX;
-  array_init(struct shared_id_work,&wa,un.n), wa.n=0, w=wa.ptr;
-  for(un_row=un.ptr,un_end=un_row+un.n;un_row!=un_end;++un_row) {
-    ulong id = un_row->id;
-    uint p1 = un_row->work_proc, i1f = un_row->src_if;
-    if(~i1f<i1f) continue;
-    for(other=un_row+1;other!=un_end&&other->id==id;++other) {
-      uint p2 = other->work_proc, i2f = other->src_if;
-      ulong ord;
-      if(id!=last_id) last_id=id, ++ordinal[0];
-      ord=ordinal[0]-1;
-      if(wa.n+2>wa.max)
-        array_reserve(struct shared_id_work,&wa,wa.n+2),
-        w=(struct shared_id_work*)wa.ptr+wa.n;
-      w->id=id, w->ord=ord, w->p1=p1, w->p2=p2, w->i1f=i1f, w->i2f=i2f, ++w;
-      w->id=id, w->ord=ord, w->p1=p2, w->p2=p1, w->i1f=i2f, w->i2f=i1f, ++w;
-      wa.n+=2;
-    }
-  }
-  /* transfer shared list to source procs */
-  sarray_transfer(struct shared_id_work,&wa, p1,0, cr);
-  /* fill output arrays from work array */
-  shared_ids_aux(sh,pr,n_unique,&wa,&cr->data);
-  array_free(&un);
-  array_free(&wa);
-  return ordinal[1];
-}
-
-static void get_topology(struct gs_topology *top,
-                         const slong *id, uint n, struct crystal *cr)
-{
-  nonzero_ids(&top->nz,id,n,&cr->data);
-  top->total_shared = shared_ids(&top->sh,&top->pr, &top->nz,cr);
-}
-
-static void make_topology_unique(struct gs_topology *top, slong *id,
-                                 uint pid, buffer *buf)
-{
-  struct array *const nz=&top->nz, *const sh=&top->sh, *const pr=&top->pr;
-  struct nonzero_id *pnz;
-  struct shared_id *pb, *pe, *e, *out;
-  struct primary_shared_id *q;
-
-  /* flag local non-primaries */
-  sarray_sort(struct nonzero_id,nz->ptr,nz->n, i,0, buf);
-  if(id) {
-    struct nonzero_id *p,*ee;
-    for(p=nz->ptr,ee=p+nz->n;p!=ee;++p)
-      if(p->i != p->primary) id[p->i]=-(slong)p->id,p->flag=1;
-  } else {
-    struct nonzero_id *p,*ee;
-    for(p=nz->ptr,ee=p+nz->n;p!=ee;++p)
-      if(p->i != p->primary) p->flag=1;
-  }
-  sarray_sort(struct nonzero_id,nz->ptr,nz->n, primary,0, buf);
-
-  /* assign owner among shared primaries */
-
-  /* create sentinel with i = -1 */
-  array_reserve(struct shared_id,sh,sh->n+1);
-  ((struct shared_id*)sh->ptr)[sh->n].i = UINT_MAX;
-  /* in the sorted list of procs sharing a given id,
-     the owner is chosen to be the j^th unflagged proc,
-     where j = id mod (length of list) */
-  sarray_sort_2(struct shared_id,sh->ptr,sh->n, i,0, p,0, buf);
-  out=sh->ptr; pnz=top->nz.ptr;
-  for(pb=sh->ptr,e=pb+sh->n;pb!=e;pb=pe) {
-    uint i = pb->i, lt=0,gt=0, owner; struct shared_id *p;
-    while(pnz->i!=i) ++pnz;
-    /* note: current proc not in list */
-    for(pe=pb; pe->i==i && pe->p<pid; ++pe) if(!(pe->flags&FLAGS_REMOTE)) ++lt;
-    for(     ; pe->i==i             ; ++pe) if(!(pe->flags&FLAGS_REMOTE)) ++gt;
-    if(!(pb->flags&FLAGS_LOCAL)) {
-      owner = pb->id%(lt+gt+1);
-      if(owner==lt) goto make_sh_unique_mine;
-      if(owner>lt) --owner;
-    } else
-      owner = pb->id%(lt+gt);
-    /* we don't own pb->id */
-    if(id) id[i] = -(slong)pb->id;
-    pnz->flag=1;
-    /* we only share this id with the owner now; remove the other entries */
-    for(p=pb; p!=pe; ++p) if(!(p->flags&FLAGS_REMOTE) && !(owner--)) break;
-    if(p!=pe) *out=*p, out->flags=FLAGS_LOCAL, ++out;
-    continue;
-  make_sh_unique_mine:
-    /* we own pb->id */
-    if(out==pb) { out=pe; for(p=pb; p!=pe; ++p) p->flags=FLAGS_REMOTE; }
-    else        for(p=pb; p!=pe; ++p) *out=*p,out->flags=FLAGS_REMOTE,++out;
-  }
-  sh->n = out - ((struct shared_id*)sh->ptr);
-
-  /* set primary_shared_id flags to match */
-  ((struct shared_id*)sh->ptr)[sh->n].i = UINT_MAX;
-  sarray_sort(struct shared_id,sh->ptr,sh->n, id,1, buf);
-  sarray_sort(struct primary_shared_id,pr->ptr,pr->n, id,1, buf);
-  q=pr->ptr;
-  for(pb=sh->ptr,e=pb+sh->n;pb!=e;pb=pe) {
-    uint i=pb->i;
-    pe=pb; while(pe->i==i) ++pe;
-    if(q->id!=pb->id) printf("FAIL!!!\n");
-    q->flag=pb->flags&FLAGS_LOCAL;
-    ++q;
-  }
-}
-
-/*------------------------------------------------------------------------------
-  Local setup
-------------------------------------------------------------------------------*/
-
-/* assumes nz is sorted by primary, then flag, then index */
-static const uint *local_map(const struct array *nz, const int ignore_flagged,
-                             uint *mem_size)
-{
-  uint *map, *p, count = 1;
-  const struct nonzero_id *row, *other, *end;
-#define DO_COUNT(cond) do \
-    for(row=nz->ptr,end=row+nz->n;row!=end;) {                     \
-      ulong row_id = row->id; int any=0;                           \
-      for(other=row+1;other!=end&&other->id==row_id&&cond;++other) \
-        any=2, ++count;                                            \
-      count+=any, row=other;                                       \
-    } while(0)
-  if(ignore_flagged) DO_COUNT(other->flag==0); else DO_COUNT(1);
-#undef DO_COUNT
-  p = map = tmalloc(uint,count); *mem_size += count*sizeof(uint);
-#define DO_SET(cond) do \
-    for(row=nz->ptr,end=row+nz->n;row!=end;) {                     \
-      ulong row_id = row->id; int any=0;                           \
-      *p++ = row->i;                                               \
-      for(other=row+1;other!=end&&other->id==row_id&&cond;++other) \
-        any=1, *p++ = other->i;                                    \
-      if(any) *p++ = UINT_MAX; else --p;                           \
-      row=other;                                                   \
-    } while(0)
-  if(ignore_flagged) DO_SET(other->flag==0); else DO_SET(1);
-#undef DO_SET
-  *p = UINT_MAX;
-  return map;
-}
-
-static const uint *flagged_primaries_map(const struct array *nz, uint *mem_size)
-{
-  uint *map, *p, count=1;
-  const struct nonzero_id *row, *end;
-  for(row=nz->ptr,end=row+nz->n;row!=end;++row)
-    if(row->i==row->primary && row->flag==1) ++count;
-  p = map = tmalloc(uint,count); *mem_size += count*sizeof(uint);
-  for(row=nz->ptr,end=row+nz->n;row!=end;++row)
-    if(row->i==row->primary && row->flag==1) *p++ = row->i;
-  *p = UINT_MAX;
-  return map;
-}
-
-/*------------------------------------------------------------------------------
-  Remote execution and setup
-------------------------------------------------------------------------------*/
-
-typedef void exec_fun(
-  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
-  unsigned transpose, const void *execdata, const struct comm *comm, char *buf);
-typedef void fin_fun(void *data);
-
-struct gs_remote {
-  uint buffer_size, mem_size;
-  void *data;
-  exec_fun *exec;
-  exec_fun *exec_irecv;
-  exec_fun *exec_isend;
-  exec_fun *exec_wait;
-  fin_fun *fin;
-};
-
-typedef void setup_fun(struct gs_remote *r, struct gs_topology *top,
-                       const struct comm *comm, buffer *buf);
-
-/*------------------------------------------------------------------------------
-  Pairwise Execution
-------------------------------------------------------------------------------*/
-struct pw_comm_data {
-  uint n;      /* number of messages */
-  uint *p;     /* message source/dest proc */
-  uint *size;  /* size of message */
-  uint total;  /* sum of message sizes */
-};
-
-struct pw_data {
-  struct pw_comm_data comm[2];
-  const uint *map[2];
-  comm_req *req;
-  uint buffer_size;
-};
-
-static char *pw_exec_recvs(char *buf, const unsigned unit_size,
-                           const struct comm *comm,
-                           const struct pw_comm_data *c, comm_req *req)
-{
-  const uint *p, *pe, *size=c->size;
-  for(p=c->p,pe=p+c->n;p!=pe;++p) {
-    size_t len = *(size++)*unit_size;
-    comm_irecv(req++,comm,buf,len,*p,*p);
-    buf += len;
-  }
-  return buf;
-}
-
-static char *pw_exec_sends(char *buf, const unsigned unit_size,
-                           const struct comm *comm,
-                           const struct pw_comm_data *c, comm_req *req)
-{
-  const uint *p, *pe, *size=c->size;
-  for(p=c->p,pe=p+c->n;p!=pe;++p) {
-    size_t len = *(size++)*unit_size;
-    comm_isend(req++,comm,buf,len,*p,comm->id);
-    buf += len;
-  }
-  return buf;
-}
-
-static void pw_exec(
-  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
-  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
-{
-  const struct pw_data *pwd = execdata;
-  static gs_scatter_fun *const scatter_to_buf[] =
-    { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop };
-  static gs_gather_fun *const gather_from_buf[] =
-    { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop };
-  const unsigned recv = 0^transpose, send = 1^transpose;
-  unsigned unit_size = vn*gs_dom_size[dom];
-  char *sendbuf;
-  /* post receives */
-  sendbuf = pw_exec_recvs(buf,unit_size,comm,&pwd->comm[recv],pwd->req);
-  /* fill send buffer */
-  scatter_to_buf[mode](sendbuf,data,vn,pwd->map[send],dom);
-  /* post sends */
-  pw_exec_sends(sendbuf,unit_size,comm,&pwd->comm[send],
-                &pwd->req[pwd->comm[recv].n]);
-  comm_wait(pwd->req,pwd->comm[0].n+pwd->comm[1].n);
-  /* gather using recv buffer */
-  gather_from_buf[mode](data,buf,vn,pwd->map[recv],dom,op);
-}
-
-/*------------------------------------------------------------------------------
-  Nonblocking Pairwise Execution
-------------------------------------------------------------------------------*/
-static void pw_exec_irecv(
-  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
-  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
-{
-  const struct pw_data *pwd = execdata;
-  // static gs_scatter_fun *const scatter_to_buf[] =
-  //   { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop };
-  // static gs_gather_fun *const gather_from_buf[] =
-  //   { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop };
-  const unsigned recv = 0^transpose; // send = 1^transpose;
-  unsigned unit_size = vn*gs_dom_size[dom];
-  /* post receives */
-  (void) pw_exec_recvs(buf,unit_size,comm,&pwd->comm[recv],pwd->req);
-}
-
-static void pw_exec_isend(
-  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
-  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
-{
-  const struct pw_data *pwd = execdata;
-  static gs_scatter_fun *const scatter_to_buf[] =
-    { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop };
-  // static gs_gather_fun *const gather_from_buf[] =
-  //   { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop };
-  const unsigned recv = 0^transpose, send = 1^transpose;
-  unsigned unit_size = vn*gs_dom_size[dom];
-
-  /* fill send buffer */
-  char *sendbuf = buf+unit_size*pwd->comm[recv].total;
-  scatter_to_buf[mode](sendbuf,data,vn,pwd->map[send],dom);
-  /* post sends */
-  pw_exec_sends(sendbuf,unit_size,comm,&pwd->comm[send],
-                &pwd->req[pwd->comm[recv].n]);
-}
-
-static void pw_exec_wait(
-  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
-  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
-{
-  const struct pw_data *pwd = execdata;
-  // static gs_scatter_fun *const scatter_to_buf[] =
-  //   { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop };
-  static gs_gather_fun *const gather_from_buf[] =
-    { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop };
-  const unsigned recv = 0^transpose; // send = 1^transpose;
-  // unsigned unit_size = vn*gs_dom_size[dom];
-
-  comm_wait(pwd->req,pwd->comm[0].n+pwd->comm[1].n);
-  /* gather using recv buffer */
-  gather_from_buf[mode](data,buf,vn,pwd->map[recv],dom,op);
-}
-
-/*------------------------------------------------------------------------------
-  Pairwise setup
-------------------------------------------------------------------------------*/
-static uint pw_comm_setup(struct pw_comm_data *data, struct array *sh,
-                          const unsigned flags_mask, buffer *buf)
-{
-  uint n=0,count=0, lp=UINT_MAX, mem_size=0;
-  struct shared_id *s, *se;
-  /* sort by remote processor and id (a globally consistent ordering) */
-  sarray_sort_2(struct shared_id,sh->ptr,sh->n, p,0, id,1, buf);
-  /* assign index into buffer */
-  for(s=sh->ptr,se=s+sh->n;s!=se;++s) {
-    if(s->flags&flags_mask) { s->bi = UINT_MAX; continue; }
-    s->bi = count++;
-    if(s->p!=lp) lp=s->p, ++n;
-  }
-  data->n = n;
-  data->p = tmalloc(uint,2*n); mem_size+=2*n*sizeof(uint);
-  data->size = data->p + n;
-  data->total = count;
-  n = 0, lp=UINT_MAX;
-  for(s=sh->ptr,se=s+sh->n;s!=se;++s) {
-    if(s->flags&flags_mask) continue;
-    if(s->p!=lp) {
-      lp=s->p;
-      if(n!=0) data->size[n-1] = count;
-      count=0, data->p[n++]=lp;
-    }
-    ++count;
-  }
-  if(n!=0) data->size[n-1] = count;
-  return mem_size;
-}
-
-static void pw_comm_free(struct pw_comm_data *data) { free(data->p); }
-
-/* assumes that the bi field of sh is set */
-static const uint *pw_map_setup(struct array *sh, buffer *buf, uint *mem_size)
-{
-  uint count=0, *map, *p;
-  struct shared_id *s, *se;
-  sarray_sort(struct shared_id,sh->ptr,sh->n, i,0, buf);
-  /* calculate map size */
-  count=1;
-  for(s=sh->ptr,se=s+sh->n;s!=se;) {
-    uint i=s->i;
-    if(s->bi==UINT_MAX) { ++s; continue; }
-    count+=3;
-    for(++s;s!=se&&s->i==i;++s) if(s->bi!=UINT_MAX) ++count;
-  }
-  /* write map */
-  p = map = tmalloc(uint,count); *mem_size += count*sizeof(uint);
-  for(s=sh->ptr,se=s+sh->n;s!=se;) {
-    uint i=s->i;
-    if(s->bi==UINT_MAX) { ++s; continue; }
-    *p++ = i, *p++ = s->bi;
-    for(++s;s!=se&&s->i==i;++s) if(s->bi!=UINT_MAX) *p++ = s->bi;
-    *p++ = UINT_MAX;
-  }
-  *p = UINT_MAX;
-  return map;
-}
-
-static struct pw_data *pw_setup_aux(struct array *sh, buffer *buf,
-                                    uint *mem_size)
-{
-  struct pw_data *pwd = tmalloc(struct pw_data,1);
-  *mem_size = sizeof(struct pw_data);
-
-  /* default behavior: receive only remotely unflagged data */
-  *mem_size+=pw_comm_setup(&pwd->comm[0],sh, FLAGS_REMOTE, buf);
-  pwd->map[0] = pw_map_setup(sh, buf, mem_size);
-
-  /* default behavior: send only locally unflagged data */
-  *mem_size+=pw_comm_setup(&pwd->comm[1],sh, FLAGS_LOCAL, buf);
-  pwd->map[1] = pw_map_setup(sh, buf, mem_size);
-
-  pwd->req = tmalloc(comm_req,pwd->comm[0].n+pwd->comm[1].n);
-  *mem_size += (pwd->comm[0].n+pwd->comm[1].n)*sizeof(comm_req);
-  pwd->buffer_size = pwd->comm[0].total + pwd->comm[1].total;
-  return pwd;
-}
-
-static void pw_free(struct pw_data *data)
-{
-  pw_comm_free(&data->comm[0]);
-  pw_comm_free(&data->comm[1]);
-  free((uint*)data->map[0]);
-  free((uint*)data->map[1]);
-  free(data->req);
-  free(data);
-}
-
-static void pw_setup(struct gs_remote *r, struct gs_topology *top,
-                     const struct comm *comm, buffer *buf)
-{
-  struct pw_data *pwd = pw_setup_aux(&top->sh,buf, &r->mem_size);
-  r->buffer_size = pwd->buffer_size;
-  r->data = pwd;
-  r->exec = (exec_fun*)&pw_exec;
-  r->exec_irecv = (exec_fun*)&pw_exec_irecv;
-  r->exec_isend = (exec_fun*)&pw_exec_isend;
-  r->exec_wait = (exec_fun*)&pw_exec_wait;
-  r->fin = (fin_fun*)&pw_free;
-}
-
-/*------------------------------------------------------------------------------
-  Crystal-Router Execution
-------------------------------------------------------------------------------*/
-struct cr_stage {
-  const uint *scatter_map, *gather_map;
-  uint size_r, size_r1, size_r2;
-  uint size_sk, size_s, size_total;
-  uint p1, p2;
-  unsigned nrecvn;
-};
-
-struct cr_data {
-  struct cr_stage *stage[2];
-  unsigned nstages;
-  uint buffer_size, stage_buffer_size;
-};
-
-static void cr_exec(
-  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
-  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
-{
-  const struct cr_data *crd = execdata;
-  static gs_scatter_fun *const scatter_user_to_buf[] =
-    { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop };
-  static gs_scatter_fun *const scatter_buf_to_buf[] =
-    { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec, &gs_scatter };
-  static gs_scatter_fun *const scatter_buf_to_user[] =
-    { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop };
-  static gs_gather_fun *const gather_buf_to_user[] =
-    { &gs_gather, &gs_gather_vec, &gs_gather_vec_to_many, &gather_noop };
-  static gs_gather_fun *const gather_buf_to_buf[] =
-    { &gs_gather, &gs_gather_vec, &gs_gather_vec, &gs_gather };
-  const unsigned unit_size = vn*gs_dom_size[dom], nstages=crd->nstages;
-  unsigned k;
-  char *sendbuf, *buf_old, *buf_new;
-  const struct cr_stage *stage = crd->stage[transpose];
-  buf_old = buf;
-  buf_new = buf_old + unit_size*crd->stage_buffer_size;
-  /* crystal router */
-  for(k=0;k<nstages;++k) {
-    comm_req req[3];
-    if(stage[k].nrecvn)
-      comm_irecv(&req[1],comm,buf_new,unit_size*stage[k].size_r1,
-               stage[k].p1, comm->np+k);
-    if(stage[k].nrecvn==2)
-      comm_irecv(&req[2],comm,buf_new+unit_size*stage[k].size_r1,
-               unit_size*stage[k].size_r2, stage[k].p2, comm->np+k);
-    sendbuf = buf_new+unit_size*stage[k].size_r;
-    if(k==0)
-      scatter_user_to_buf[mode](sendbuf,data,vn,stage[0].scatter_map,dom);
-    else
-      scatter_buf_to_buf[mode](sendbuf,buf_old,vn,stage[k].scatter_map,dom),
-      gather_buf_to_buf [mode](sendbuf,buf_old,vn,stage[k].gather_map ,dom,op);
-
-    comm_isend(&req[0],comm,sendbuf,unit_size*stage[k].size_s,
-               stage[k].p1, comm->np+k);
-    comm_wait(&req[0],1+stage[k].nrecvn);
-    { char *t = buf_old; buf_old=buf_new; buf_new=t; }
-  }
-  scatter_buf_to_user[mode](data,buf_old,vn,stage[k].scatter_map,dom);
-  gather_buf_to_user [mode](data,buf_old,vn,stage[k].gather_map ,dom,op);
-}
-
-/*------------------------------------------------------------------------------
-  Crystal-Router setup
-------------------------------------------------------------------------------*/
-static uint cr_schedule(struct cr_data *data, const struct comm *comm)
-{
-  uint mem_size = 0;
-  const uint id = comm->id;
-  uint bl=0, n=comm->np;
-  unsigned k = 0;
-  while(n>1) {
-    uint nl = (n+1)/2, bh = bl+nl;
-    if(id<bh) n=nl; else n-=nl,bl=bh;
-    ++k;
-  }
-  data->nstages = k;
-  data->stage[0] = tmalloc(struct cr_stage,2*(k+1));
-  data->stage[1] = data->stage[0] + (k+1);
-  mem_size += 2*(k+1)*sizeof(struct cr_stage);
-  bl=0, n=comm->np, k=0;
-  while(n>1) {
-    uint nl = (n+1)/2, bh = bl+nl;
-    uint targ; unsigned recvn;
-    recvn = 1, targ = n-1-(id-bl)+bl;
-    if(id==targ) targ=bh, recvn=0;
-    if(n&1 && id==bh) recvn=2;
-    data->stage[1][k].nrecvn=data->stage[0][k].nrecvn=recvn;
-    data->stage[1][k].p1    =data->stage[0][k].p1    =targ;
-    data->stage[1][k].p2    =data->stage[0][k].p2    =comm->id-1;
-    if(id<bh) n=nl; else n-=nl,bl=bh;
-    ++k;
-  }
-  return mem_size;
-}
-
-struct crl_id {
-  ulong id; uint p, ri, si, bi, send;
-};
-
-/* assumes sh is grouped by i (e.g., sorted by i or by id) */
-static void crl_work_init(struct array *cw, struct array *sh,
-                          const unsigned send_mask, uint this_p)
-{
-  const unsigned recv_mask = send_mask^(FLAGS_REMOTE|FLAGS_LOCAL);
-  uint last_i=UINT_MAX; int added_myself=0;
-  uint cw_n = 0, cw_max = cw->max;
-  struct crl_id *w = cw->ptr;
-  struct shared_id *s, *se;
-
-#define CW_ADD(aid,ap,ari,asi) do { \
-    if(cw_n==cw_max)                                         \
-      array_reserve(struct crl_id,cw,cw_n+1),cw_max=cw->max, \
-      w=(struct crl_id*)cw->ptr+cw_n;                        \
-    w->id=aid, w->p=ap, w->ri=ari, w->si=asi;                \
-    ++w, ++cw_n;                                             \
-  } while(0)
-
-  for(s=sh->ptr,se=s+sh->n;s!=se;++s) {
-    int send = (s->flags&send_mask)==0;
-    int recv = (s->flags&recv_mask)==0;
-    if(s->i!=last_i) last_i=s->i, added_myself=0;
-    if(!added_myself && recv && (s->flags&FLAGS_LOCAL)==0) {
-      added_myself=1;
-      CW_ADD(s->id,this_p,s->i,s->i);
-    }
-    if(send) CW_ADD(s->id,s->p,s->ri,s->i);
-  }
-  cw->n=cw_n;
-#undef CW_ADD
-}
-
-static uint crl_maps(struct cr_stage *stage, struct array *cw, buffer *buf)
-{
-  uint mem_size=0;
-  struct crl_id *w, *we, *other;
-  uint scount=1, gcount=1, *sp, *gp;
-  sarray_sort_2(struct crl_id,cw->ptr,cw->n, bi,0, si,0, buf);
-  for(w=cw->ptr,we=w+cw->n;w!=we;w=other) {
-    uint bi=w->bi,any=0,si=w->si;
-    scount+=3;
-    for(other=w+1;other!=we&&other->bi==bi;++other)
-      if(other->si!=si) si=other->si, any=2, ++gcount;
-    gcount+=any;
-  }
-  stage->scatter_map = sp = tmalloc(uint,scount+gcount);
-  stage->gather_map  = gp = sp + scount;
-  mem_size += (scount+gcount)*sizeof(uint);
-  for(w=cw->ptr,we=w+cw->n;w!=we;w=other) {
-    uint bi=w->bi,any=0,si=w->si;
-    *sp++ = w->si, *sp++ = bi;
-    *gp++ = bi;
-    for(other=w+1;other!=we&&other->bi==bi;++other)
-      if(other->si!=si) si=other->si, any=1, *gp++ = si;
-    if(any) *gp++ = UINT_MAX; else --gp;
-    *sp++ = UINT_MAX;
-  }
-  *sp=UINT_MAX, *gp=UINT_MAX;
-  return mem_size;
-}
-
-static uint crl_work_label(struct array *cw, struct cr_stage *stage,
-                           uint cutoff, int send_hi, buffer *buf,
-                           uint *mem_size)
-{
-  struct crl_id *w, *we, *start;
-  uint nsend, nkeep = 0, nks = 0, bi=0;
-  /* here w->send has a reverse meaning */
-  if(send_hi) for(w=cw->ptr,we=w+cw->n;w!=we;++w) w->send = w->p< cutoff;
-         else for(w=cw->ptr,we=w+cw->n;w!=we;++w) w->send = w->p>=cutoff;
-  sarray_sort_2(struct crl_id,cw->ptr,cw->n, id,1, send,0, buf);
-  for(start=cw->ptr,w=start,we=w+cw->n;w!=we;++w) {
-    nkeep += w->send;
-    if(w->id!=start->id) start=w;
-    if(w->send!=start->send) w->send=0,w->bi=1, ++nks; else w->bi=0;
-  }
-  nsend = cw->n-nkeep;
-  /* assign indices; sent ids have priority (hence w->send is reversed) */
-  sarray_sort(struct crl_id,cw->ptr,cw->n, send,0, buf);
-  for(start=cw->ptr,w=start,we=w+nsend+nks;w!=we;++w) {
-    if(w->id!=start->id) start=w, ++bi;
-    if(w->bi!=1) w->send=1;   /* switch back to the usual semantics */
-    w->bi = bi;
-  }
-  stage->size_s = nsend+nks==0 ? 0 : bi+1;
-  for(we=(struct crl_id*)cw->ptr+cw->n;w!=we;++w) {
-    if(w->id!=start->id) start=w, ++bi;
-    w->send = 0;              /* switch back to the usual semantics */
-    w->bi = bi;
-  }
-  stage->size_sk = cw->n==0 ? 0 : bi+1;
-  *mem_size += crl_maps(stage,cw,buf);
-  return nsend;
-}
-
-static void crl_bi_to_si(struct crl_id *w, uint n, uint v) {
-  for(;n;--n) w->si=w->bi+v, ++w;
-}
-
-static void crl_ri_to_bi(struct crl_id *w, uint n) {
-  for(;n;--n) w->bi=w->ri, ++w;
-}
-
-static uint cr_learn(struct array *cw, struct cr_stage *stage,
-                     const struct comm *comm, buffer *buf, uint *mem_size)
-{
-  comm_req req[3];
-  const uint id = comm->id;
-  uint bl=0, n=comm->np;
-  uint size_max=0;
-  uint tag = comm->np;
-  while(n>1) {
-    uint nl = (n+1)/2, bh = bl+nl;
-    uint nkeep, nsend[2], nrecv[2][2] = {{0,0},{0,0}};
-    struct crl_id *wrecv[2], *wsend;
-    nsend[0] = crl_work_label(cw,stage,bh,id<bh,buf, mem_size);
-    nsend[1] = stage->size_s;
-    nkeep = cw->n - nsend[0];
-
-    if(stage->nrecvn   ) comm_irecv(&req[1],comm,nrecv[0],2*sizeof(uint),
-                                    stage->p1,tag);
-    if(stage->nrecvn==2) comm_irecv(&req[2],comm,nrecv[1],2*sizeof(uint),
-                                    stage->p2,tag);
-    comm_isend(&req[0],comm,nsend,2*sizeof(uint),stage->p1,tag);
-    comm_wait(req,1+stage->nrecvn),++tag;
-
-    stage->size_r1 = nrecv[0][1], stage->size_r2 = nrecv[1][1];
-    stage->size_r = stage->size_r1 + stage->size_r2;
-    stage->size_total = stage->size_r + stage->size_sk;
-    if(stage->size_total>size_max) size_max=stage->size_total;
-
-    array_reserve(struct crl_id,cw,cw->n+nrecv[0][0]+nrecv[1][0]);
-    wrecv[0] = cw->ptr, wrecv[0] += cw->n, wrecv[1] = wrecv[0]+nrecv[0][0];
-    wsend = cw->ptr, wsend += nkeep;
-    if(stage->nrecvn   )
-      comm_irecv(&req[1],comm,wrecv[0],nrecv[0][0]*sizeof(struct crl_id),
-                 stage->p1,tag);
-    if(stage->nrecvn==2)
-      comm_irecv(&req[2],comm,wrecv[1],nrecv[1][0]*sizeof(struct crl_id),
-                 stage->p2,tag);
-    sarray_sort_2(struct crl_id,cw->ptr,cw->n, send,0, bi,0, buf);
-    comm_isend(&req[0],comm,wsend,nsend[0]*sizeof(struct crl_id),stage->p1,tag);
-    comm_wait(req,1+stage->nrecvn),++tag;
-
-    crl_bi_to_si(cw->ptr,nkeep,stage->size_r);
-    if(stage->nrecvn)    crl_bi_to_si(wrecv[0],nrecv[0][0],0);
-    if(stage->nrecvn==2) crl_bi_to_si(wrecv[1],nrecv[1][0],stage->size_r1);
-    memmove(wsend,wrecv[0],(nrecv[0][0]+nrecv[1][0])*sizeof(struct crl_id));
-    cw->n += nrecv[0][0] + nrecv[1][0];
-    cw->n -= nsend[0];
-
-    if(id<bh) n=nl; else n-=nl,bl=bh;
-    ++stage;
-  }
-  crl_ri_to_bi(cw->ptr,cw->n);
-  *mem_size += crl_maps(stage,cw,buf);
-  return size_max;
-}
-
-static struct cr_data *cr_setup_aux(
-  struct array *sh, const struct comm *comm, buffer *buf, uint *mem_size)
-{
-  uint size_max[2];
-  struct array cw = null_array;
-  struct cr_data *crd = tmalloc(struct cr_data,1);
-  *mem_size = sizeof(struct cr_data);
-
-  /* default behavior: receive only remotely unflagged data */
-  /* default behavior: send only locally unflagged data */
-
-  *mem_size += cr_schedule(crd,comm);
-
-  sarray_sort(struct shared_id,sh->ptr,sh->n, i,0, buf);
-  crl_work_init(&cw,sh, FLAGS_LOCAL , comm->id);
-  size_max[0]=cr_learn(&cw,crd->stage[0],comm,buf, mem_size);
-  crl_work_init(&cw,sh, FLAGS_REMOTE, comm->id);
-  size_max[1]=cr_learn(&cw,crd->stage[1],comm,buf, mem_size);
-
-  crd->stage_buffer_size = size_max[1]>size_max[0]?size_max[1]:size_max[0];
-
-  array_free(&cw);
-
-  crd->buffer_size = 2*crd->stage_buffer_size;
-  return crd;
-}
-
-static void cr_free_stage_maps(struct cr_stage *stage, unsigned kmax)
-{
-  unsigned k;
-  for(k=0; k<kmax; ++k) {
-    free((uint*)stage->scatter_map);
-    ++stage;
-  }
-  free((uint*)stage->scatter_map);
-}
-
-static void cr_free(struct cr_data *data)
-{
-  cr_free_stage_maps(data->stage[0],data->nstages);
-  cr_free_stage_maps(data->stage[1],data->nstages);
-  free(data->stage[0]);
-  free(data);
-}
-
-static void cr_setup(struct gs_remote *r, struct gs_topology *top,
-                     const struct comm *comm, buffer *buf)
-{
-  struct cr_data *crd = cr_setup_aux(&top->sh,comm,buf, &r->mem_size);
-  r->buffer_size = crd->buffer_size;
-  r->data = crd;
-  r->exec = (exec_fun*)&cr_exec;
-  r->fin = (fin_fun*)&cr_free;
-}
-
-/*------------------------------------------------------------------------------
-  All-reduce Execution
-------------------------------------------------------------------------------*/
-struct allreduce_data {
-  const uint *map_to_buf[2], *map_from_buf[2];
-  uint buffer_size;
-  comm_req *req;
-};
-
-static void allreduce_exec(
-  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
-  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
-{
-  const struct allreduce_data *ard = execdata;
-  static gs_scatter_fun *const scatter_to_buf[] =
-    { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop };
-  static gs_scatter_fun *const scatter_from_buf[] =
-    { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop };
-  uint gvn = vn*(ard->buffer_size/2);
-  unsigned unit_size = gs_dom_size[dom];
-  char *ardbuf;
-  ardbuf = buf+unit_size*gvn;
-  /* user array -> buffer */
-  gs_init_array(buf,gvn,dom,op);
-  scatter_to_buf[mode](buf,data,vn,ard->map_to_buf[transpose],dom);
-  /* all reduce */
-  comm_allreduce(comm,dom,op, buf,gvn, ardbuf);
-  /* buffer -> user array */
-  scatter_from_buf[mode](data,buf,vn,ard->map_from_buf[transpose],dom);
-}
-
-/*------------------------------------------------------------------------------
-  Nonblocking All-reduce Execution
-------------------------------------------------------------------------------*/
-static void allreduce_exec_i(
-  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
-  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
-{
-  const struct allreduce_data *ard = execdata;
-  static gs_scatter_fun *const scatter_to_buf[] =
-    { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop };
-  // static gs_scatter_fun *const scatter_from_buf[] =
-  //   { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop };
-  uint gvn = vn*(ard->buffer_size/2);
-  unsigned unit_size = gs_dom_size[dom];
-  char *ardbuf = buf+unit_size*gvn;
-
-  /* user array -> buffer */
-  gs_init_array(buf,gvn,dom,op);
-  scatter_to_buf[mode](buf,data,vn,ard->map_to_buf[transpose],dom);
-  /* all reduce */
-  comm_iallreduce(ard->req,comm,dom,op,buf,gvn,ardbuf);
-}
-
-static void allreduce_exec_wait(
-  void *data, gs_mode mode, unsigned vn, gs_dom dom, gs_op op,
-  unsigned transpose, const void *execdata, const struct comm *comm, char *buf)
-{
-  const struct allreduce_data *ard = execdata;
-  // static gs_scatter_fun *const scatter_to_buf[] =
-  //   { &gs_scatter, &gs_scatter_vec, &gs_scatter_many_to_vec, &scatter_noop };
-  static gs_scatter_fun *const scatter_from_buf[] =
-    { &gs_scatter, &gs_scatter_vec, &gs_scatter_vec_to_many, &scatter_noop };
-  uint gvn = vn*(ard->buffer_size/2);
-  unsigned unit_size = gs_dom_size[dom];
-  char *ardbuf = buf+unit_size*gvn;
-
-  // Why do I need this? Ugly !!!
-  if (comm->np > 1)
-    comm_wait(ard->req, 1);
-#ifdef MPI
-  memcpy(buf,ardbuf,gvn*gs_dom_size[dom]);
-#endif
-  /* buffer -> user array */
-  scatter_from_buf[mode](data,buf,vn,ard->map_from_buf[transpose],dom);
-}
-
-/*------------------------------------------------------------------------------
-  All-reduce setup
-------------------------------------------------------------------------------*/
-static const uint *allreduce_map_setup(
-  struct array *pr, const unsigned flags_mask, int to_buf, uint *mem_size)
-{
-  struct primary_shared_id *p, *pe;
-  uint count=1, *map, *m;
-  for(p=pr->ptr,pe=p+pr->n;p!=pe;++p)
-    if((p->flag&flags_mask)==0) count+=3;
-  m=map=tmalloc(uint,count); *mem_size += count*sizeof(uint);
-  if(to_buf) {
-    for(p=pr->ptr,pe=p+pr->n;p!=pe;++p)
-      if((p->flag&flags_mask)==0)
-        *m++ = p->i, *m++ = p->ord, *m++ = UINT_MAX;
-  } else {
-    for(p=pr->ptr,pe=p+pr->n;p!=pe;++p)
-      if((p->flag&flags_mask)==0)
-        *m++ = p->ord, *m++ = p->i, *m++ = UINT_MAX;
-  }
-  *m=UINT_MAX;
-  return map;
-}
-
-static struct allreduce_data *allreduce_setup_aux(
-  struct array *pr, ulong total_shared, uint *mem_size)
-{
-  struct allreduce_data *ard = tmalloc(struct allreduce_data,1);
-  *mem_size = sizeof(struct allreduce_data);
-
-  /* default behavior: reduce only unflagged data, copy to all */
-  ard->map_to_buf  [0] = allreduce_map_setup(pr,1,1, mem_size);
-  ard->map_from_buf[0] = allreduce_map_setup(pr,0,0, mem_size);
-
-  /* transpose behavior: reduce all data, copy to unflagged */
-  ard->map_to_buf  [1] = allreduce_map_setup(pr,0,1, mem_size);
-  ard->map_from_buf[1] = allreduce_map_setup(pr,1,0, mem_size);
-
-  ard->req = tmalloc(comm_req, 1);
-
-  ard->buffer_size = total_shared*2;
-  return ard;
-}
-
-static void allreduce_free(struct allreduce_data *ard)
-{
-  free((uint*)ard->map_to_buf[0]);
-  free((uint*)ard->map_to_buf[1]);
-  free((uint*)ard->map_from_buf[0]);
-  free((uint*)ard->map_from_buf[1]);
-  free(ard);
-}
-
-static void allreduce_setup(struct gs_remote *r, struct gs_topology *top,
-                            const struct comm *comm, buffer *buf)
-{
-  struct allreduce_data *ard
-    = allreduce_setup_aux(&top->pr,top->total_shared, &r->mem_size);
-  r->buffer_size = ard->buffer_size;
-  r->data = ard;
-  r->exec = (exec_fun*)&allreduce_exec;
-  r->exec_irecv = (exec_fun*)&allreduce_exec_i;
-  r->exec_isend = NULL;
-  r->exec_wait = (exec_fun*)&allreduce_exec_wait;
-  r->fin = (fin_fun*)&allreduce_free;
-}
-
-/*------------------------------------------------------------------------------
-  Automatic Setup --- dynamically picks the fastest method
-------------------------------------------------------------------------------*/
-
-static void dry_run_time(double times[3], const struct gs_remote *r,
-                         const struct comm *comm, buffer *buf)
-{
-  int i; double t;
-  buffer_reserve(buf,gs_dom_size[gs_double]*r->buffer_size);
-  for(i= 2;i;--i)
-    r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr);
-  comm_barrier(comm);
-  t = comm_time();
-  for(i=10;i;--i)
-    r->exec(0,mode_dry_run,1,gs_double,gs_add,0,r->data,comm,buf->ptr);
-  t = (comm_time() - t)/10;
-  times[0] = t/comm->np, times[1] = t, times[2] = t;
-  comm_allreduce(comm,gs_double,gs_add, &times[0],1, &t);
-  comm_allreduce(comm,gs_double,gs_min, &times[1],1, &t);
-  comm_allreduce(comm,gs_double,gs_max, &times[2],1, &t);
-}
-
-static void auto_setup(struct gs_remote *r, struct gs_topology *top,
-                       const struct comm *comm, buffer *buf)
-{
-  pw_setup(r, top,comm,buf);
-
-  if(comm->np>1) {
-    // const char *name = "pairwise";
-    struct gs_remote r_alt;
-    double time[2][3];
-
-#if 0
-    #define DRY_RUN(i,gsr,str) do { \
-      if(comm->id==0) printf("   " str ": "); \
-      dry_run_time(time[i],gsr,comm,buf); \
-      if(comm->id==0) \
-        printf("%g %g %g\n",time[i][0],time[i][1],time[i][2]); \
-    } while(0)
-#endif
-
-    #define DRY_RUN(i,gsr,str) do { \
-      dry_run_time(time[i],gsr,comm,buf); \
-    } while(0)
-
-    #define DRY_RUN_CHECK(str,new_name) do { \
-      DRY_RUN(1,&r_alt,str); \
-      if(time[1][2]<time[0][2]) \
-        time[0][2]=time[1][2], /*name=new_name,*/ \
-        r->fin(r->data), *r = r_alt; \
-      else \
-        r_alt.fin(r_alt.data); \
-    } while(0)
-
-    DRY_RUN(0, r, "pairwise times (avg, min, max)");
-
-    cr_setup(&r_alt, top,comm,buf);
-    DRY_RUN_CHECK(      "crystal router                ", "crystal router");
-
-    if(top->total_shared<100000) {
-      allreduce_setup(&r_alt, top,comm,buf);
-      DRY_RUN_CHECK(    "all reduce                    ", "allreduce");
-    }
-
-    #undef DRY_RUN_CHECK
-    #undef DRY_RUN
-
-    // if(comm->id==0) printf("   used all_to_all method: %s\n",name);
-  }
-}
-
-/*------------------------------------------------------------------------------
-  Main Execution
-------------------------------------------------------------------------------*/
-struct gs_data {
-  struct comm comm;
-  const uint *map_local[2]; /* 0=unflagged, 1=all */
-  const uint *flagged_primaries;
-  struct gs_remote r;
-  uint handle_size;
-};
-
-/*------------------------------------------------------------------------------
-  GS_AUX - blocking and non-blocking
-------------------------------------------------------------------------------*/
-static void gs_aux(
-  void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose,
-  struct gs_data *gsh, buffer *buf)
-{
-  static gs_scatter_fun *const local_scatter[] =
-    { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop };
-  static gs_gather_fun  *const local_gather [] =
-    { &gs_gather,  &gs_gather_vec,  &gs_gather_many, &gather_noop  };
-  static gs_init_fun *const init[] =
-    { &gs_init, &gs_init_vec, &gs_init_many, &init_noop };
-  if(!buf) buf = &static_buffer;
-  buffer_reserve(buf,vn*gs_dom_size[dom]*gsh->r.buffer_size);
-  local_gather [mode](u,u,vn,gsh->map_local[0^transpose],dom,op);
-  if(transpose==0) init[mode](u,vn,gsh->flagged_primaries,dom,op);
-  gsh->r.exec(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr);
-  local_scatter[mode](u,u,vn,gsh->map_local[1^transpose],dom);
-}
-
-static void gs_aux_irecv(
-  void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose,
-  struct gs_data *gsh, buffer *buf)
-{
-  // static gs_scatter_fun *const local_scatter[] =
-  //   { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop };
-  static gs_gather_fun  *const local_gather [] =
-    { &gs_gather,  &gs_gather_vec,  &gs_gather_many, &gather_noop  };
-  static gs_init_fun *const init[] =
-    { &gs_init, &gs_init_vec, &gs_init_many, &init_noop };
-  if(!buf) buf = &static_buffer;
-  buffer_reserve(buf,vn*gs_dom_size[dom]*gsh->r.buffer_size);
-  local_gather [mode](u,u,vn,gsh->map_local[0^transpose],dom,op);
-  if(transpose==0) init[mode](u,vn,gsh->flagged_primaries,dom,op);
-
-  if (gsh->r.exec_irecv)
-    gsh->r.exec_irecv(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr);
-}
-
-static void gs_aux_isend(
-  void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose,
-  struct gs_data *gsh, buffer *buf)
-{
-  // static gs_scatter_fun *const local_scatter[] =
-  //   { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop };
-  // static gs_gather_fun  *const local_gather [] =
-  //   { &gs_gather,  &gs_gather_vec,  &gs_gather_many, &gather_noop  };
-  // static gs_init_fun *const init[] =
-  //   { &gs_init, &gs_init_vec, &gs_init_many, &init_noop };
-
-  if(!buf) buf = &static_buffer;
-
-  if (gsh->r.exec_isend)
-    gsh->r.exec_isend(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr);
-}
-
-static void gs_aux_wait(
-  void *u, gs_mode mode, unsigned vn, gs_dom dom, gs_op op, unsigned transpose,
-  struct gs_data *gsh, buffer *buf)
-{
-  static gs_scatter_fun *const local_scatter[] =
-    { &gs_scatter, &gs_scatter_vec, &gs_scatter_many, &scatter_noop };
-  // static gs_gather_fun  *const local_gather [] =
-  //   { &gs_gather,  &gs_gather_vec,  &gs_gather_many, &gather_noop  };
-  // static gs_init_fun *const init[] =
-  //   { &gs_init, &gs_init_vec, &gs_init_many, &init_noop };
-
-  if(!buf) buf = &static_buffer;
-
-  if (gsh->r.exec_wait)
-    gsh->r.exec_wait(u,mode,vn,dom,op,transpose,gsh->r.data,&gsh->comm,buf->ptr);
-
-  local_scatter[mode](u,u,vn,gsh->map_local[1^transpose],dom);
-}
-
-/*------------------------------------------------------------------------------
-  GS interface - blocking and non-blocking
-------------------------------------------------------------------------------*/
-struct nonblocking_private {
-  void *u;
-  gs_mode mode;
-  gs_dom dom;
-  gs_op op;
-  unsigned transpose;
-  struct gs_data *gsh;
-  buffer *buf;
-  unsigned vn;
-};
-
-typedef struct nonblocking_private* nblkng;
-
-static nblkng *nblkng_dict;
-static int nblkng_max = 0;
-static int nblkng_n = 0;
-static int nblkng_count = 0;
-
-void gs(void *u, gs_dom dom, gs_op op, unsigned transpose,
-        struct gs_data *gsh, buffer *buf)
-{
-  gs_aux(u,mode_plain,1,dom,op,transpose,gsh,buf);
-}
-
-void igs(void *u, gs_dom dom, gs_op op, unsigned transpose,
-        struct gs_data *gsh, buffer *buf, int *handle)
-{
-  if(nblkng_n==nblkng_max) nblkng_max+=nblkng_max/2+1,
-                     nblkng_dict=trealloc(nblkng,nblkng_dict,nblkng_max);
-
-  nblkng_dict[nblkng_n] = tmalloc(struct nonblocking_private, 1);
-
-  nblkng_dict[nblkng_n]->u = u;
-  nblkng_dict[nblkng_n]->dom = dom;
-  nblkng_dict[nblkng_n]->op = op;
-  nblkng_dict[nblkng_n]->transpose = transpose;
-  nblkng_dict[nblkng_n]->gsh = gsh;
-  nblkng_dict[nblkng_n]->buf = buf;
-  nblkng_dict[nblkng_n]->mode = mode_plain;
-  nblkng_dict[nblkng_n]->vn = 1;
-
-  *handle = nblkng_n++;
-  nblkng_count++;
-
-  gs_aux_irecv(u,mode_plain,1,dom,op,transpose,gsh,buf);
-  gs_aux_isend(u,mode_plain,1,dom,op,transpose,gsh,buf);
-}
-
-void gs_wait(int handle)
-{
-  if(handle < nblkng_n) {
-    gs_aux_wait(nblkng_dict[handle]->u,
-	        nblkng_dict[handle]->mode,
-	        nblkng_dict[handle]->vn,
-	        nblkng_dict[handle]->dom,
-	        nblkng_dict[handle]->op,
-	        nblkng_dict[handle]->transpose,
-	        nblkng_dict[handle]->gsh,
-	        nblkng_dict[handle]->buf);
-    free(nblkng_dict[handle]);
-    nblkng_dict[handle] = 0;
-    nblkng_count--;
-  }
-
-  if(nblkng_count == 0) {
-    free(nblkng_dict);
-    nblkng_dict = 0;
-    nblkng_max = 0;
-    nblkng_n = 0;
-  }
-}
-/*------------------------------------------------------------------------------
-  GS_VEC interface - blocking and non-blocking
-------------------------------------------------------------------------------*/
-void gs_vec(void *u, unsigned vn, gs_dom dom, gs_op op,
-            unsigned transpose, struct gs_data *gsh, buffer *buf)
-{
-  gs_aux(u,mode_vec,vn,dom,op,transpose,gsh,buf);
-}
-
-void igs_vec(void *u, unsigned vn, gs_dom dom, gs_op op,
-            unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle)
-{
-  if(nblkng_n==nblkng_max) nblkng_max+=nblkng_max/2+1,
-                     nblkng_dict=trealloc(nblkng,nblkng_dict,nblkng_max);
-
-  nblkng_dict[nblkng_n] = tmalloc(struct nonblocking_private, 1);
-
-  nblkng_dict[nblkng_n]->u = u;
-  nblkng_dict[nblkng_n]->dom = dom;
-  nblkng_dict[nblkng_n]->op = op;
-  nblkng_dict[nblkng_n]->transpose = transpose;
-  nblkng_dict[nblkng_n]->gsh = gsh;
-  nblkng_dict[nblkng_n]->buf = buf;
-  nblkng_dict[nblkng_n]->vn = vn;
-  nblkng_dict[nblkng_n]->mode = mode_vec;
-
-  *handle = nblkng_n++;
-  nblkng_count++;
-
-  gs_aux_irecv(u,mode_vec,vn,dom,op,transpose,gsh,buf);
-  gs_aux_isend(u,mode_vec,vn,dom,op,transpose,gsh,buf);
-}
-/*------------------------------------------------------------------------------
-  GS_MANY interface - blocking and non-blocking
-------------------------------------------------------------------------------*/
-void gs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op,
-             unsigned transpose, struct gs_data *gsh, buffer *buf)
-{
-  gs_aux((void*)u,mode_many,vn,dom,op,transpose,gsh,buf);
-}
-
-void igs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op,
-             unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle)
-{
-  if(nblkng_n==nblkng_max) nblkng_max+=nblkng_max/2+1,
-                     nblkng_dict=trealloc(nblkng,nblkng_dict,nblkng_max);
-
-  nblkng_dict[nblkng_n] = tmalloc(struct nonblocking_private, 1);
-
-  nblkng_dict[nblkng_n]->u = (void *)u;
-  nblkng_dict[nblkng_n]->dom = dom;
-  nblkng_dict[nblkng_n]->op = op;
-  nblkng_dict[nblkng_n]->transpose = transpose;
-  nblkng_dict[nblkng_n]->gsh = gsh;
-  nblkng_dict[nblkng_n]->buf = buf;
-  nblkng_dict[nblkng_n]->vn = vn;
-  nblkng_dict[nblkng_n]->mode = mode_many;
-
-  *handle = nblkng_n++;
-  nblkng_count++;
-
-  gs_aux_irecv((void *)u,mode_many,vn,dom,op,transpose,gsh,buf);
-  gs_aux_isend((void *)u,mode_many,vn,dom,op,transpose,gsh,buf);
-}
-
-/*------------------------------------------------------------------------------
-  Main Setup
-------------------------------------------------------------------------------*/
-typedef enum {gs_auto, gs_pairwise, gs_crystal_router, gs_all_reduce} gs_method;
-
-static uint local_setup(struct gs_data *gsh, const struct array *nz)
-{
-  uint mem_size = 0;
-  gsh->map_local[0] = local_map(nz,1, &mem_size);
-  gsh->map_local[1] = local_map(nz,0, &mem_size);
-  gsh->flagged_primaries = flagged_primaries_map(nz, &mem_size);
-  return mem_size;
-}
-
-static void gs_setup_aux(struct gs_data *gsh, const slong *id, uint n,
-                         int unique, gs_method method, int verbose)
-{
-  static setup_fun *const remote_setup[] =
-    { &auto_setup, &pw_setup, &cr_setup, &allreduce_setup };
-
-  struct gs_topology top;
-  struct crystal cr;
-
-  crystal_init(&cr,&gsh->comm);
-
-  get_topology(&top, id,n, &cr);
-  if(unique) make_topology_unique(&top,0,gsh->comm.id,&cr.data);
-
-  gsh->handle_size = sizeof(struct gs_data);
-  gsh->handle_size += local_setup(gsh,&top.nz);
-
-  if(verbose && gsh->comm.id==0)
-    printf("gs_setup: %ld unique labels shared\n",(long)top.total_shared);
-
-  remote_setup[method](&gsh->r, &top,&gsh->comm,&cr.data);
-  gsh->handle_size += gsh->r.mem_size;
-
-  if(verbose) { /* report memory usage */
-    double avg[2],td[2]; uint min[2],max[2],ti[2];
-    avg[0] = min[0] = max[0] = gsh->handle_size;
-    avg[1] = min[1] = max[1] = sizeof(double)*gsh->r.buffer_size;
-    avg[0] /= gsh->comm.np; avg[1] /= gsh->comm.np;
-    comm_allreduce(&gsh->comm,gs_double,gs_add, avg,2, td);
-    comm_allreduce(&gsh->comm,gs_sint,gs_min, min,2, ti);
-    comm_allreduce(&gsh->comm,gs_sint,gs_max, max,2, ti);
-    if(gsh->comm.id==0) {
-      printf("   " "handle bytes (avg, min, max)" ": " "%g %u %u\n",
-        avg[0], (unsigned)min[0], (unsigned)max[0]);
-      printf("   " "buffer bytes (avg, min, max)" ": " "%g %u %u\n",
-        avg[1], (unsigned)min[1], (unsigned)max[1]);
-    }
-  }
-
-  gs_topology_free(&top);
-  crystal_free(&cr);
-}
-
-struct gs_data *gs_setup(const slong *id, uint n, const struct comm *comm,
-                         int unique, gs_method method, int verbose)
-{
-  struct gs_data *gsh = tmalloc(struct gs_data,1);
-  comm_dup(&gsh->comm,comm);
-  gs_setup_aux(gsh,id,n,unique,method,verbose);
-  return gsh;
-}
-
-void gs_free(struct gs_data *gsh)
-{
-  comm_free(&gsh->comm);
-  free((uint*)gsh->map_local[0]), free((uint*)gsh->map_local[1]);
-  free((uint*)gsh->flagged_primaries);
-  gsh->r.fin(gsh->r.data);
-  free(gsh);
-}
-
-void gs_unique(slong *id, uint n, const struct comm *comm)
-{
-  struct gs_topology top;
-  struct crystal cr;
-  crystal_init(&cr,comm);
-  get_topology(&top, id,n, &cr);
-  make_topology_unique(&top,id,comm->id,&cr.data);
-  gs_topology_free(&top);
-  crystal_free(&cr);
-}
-
-/*------------------------------------------------------------------------------
-  FORTRAN interface
-------------------------------------------------------------------------------*/
-
-#undef gs_op
-
-#undef gs_unique
-#undef gs_free
-#undef gs_setup
-#undef gs_many
-#undef gs_vec
-#undef gs
-#undef igs
-#undef igs_vec
-#undef igs_many
-#undef gs_wait
-
-#define cgs         PREFIXED_NAME(gs      )
-#define cgs_vec     PREFIXED_NAME(gs_vec  )
-#define cgs_many    PREFIXED_NAME(gs_many )
-#define cgs_setup   PREFIXED_NAME(gs_setup)
-#define cgs_free    PREFIXED_NAME(gs_free )
-#define cgs_unique  PREFIXED_NAME(gs_unique)
-#define cigs        PREFIXED_NAME(igs     )
-#define cigs_vec    PREFIXED_NAME(igs_vec )
-#define cigs_many   PREFIXED_NAME(igs_many)
-#define cgs_wait    PREFIXED_NAME(gs_wait )
-
-#define fgs_setup_pick FORTRAN_NAME(gs_setup_pick,GS_SETUP_PICK)
-#define fgs_setup      FORTRAN_NAME(gs_setup     ,GS_SETUP     )
-#define fgs            FORTRAN_NAME(gs_op        ,GS_OP        )
-#define fgs_vec        FORTRAN_NAME(gs_op_vec    ,GS_OP_VEC    )
-#define fgs_many       FORTRAN_NAME(gs_op_many   ,GS_OP_MANY   )
-#define figs           FORTRAN_NAME(igs_op       ,IGS_OP       )
-#define figs_vec       FORTRAN_NAME(igs_op_vec   ,IGS_OP_VEC   )
-#define figs_many      FORTRAN_NAME(igs_op_many  ,IGS_OP_MANY  )
-#define fgs_wait       FORTRAN_NAME(gs_op_wait   ,GS_OP_WAIT   )
-#define fgs_fields     FORTRAN_NAME(gs_op_fields ,GS_OP_FIELDS )
-#define fgs_free       FORTRAN_NAME(gs_free      ,GS_FREE      )
-#define fgs_unique     FORTRAN_NAME(gs_unique    ,GS_UNIQUE    )
-
-static struct gs_data **fgs_info = 0;
-static int fgs_max = 0;
-static int fgs_n = 0;
-
-struct gs_data* gs_hf2c(const sint gsh)
-{
-  return fgs_info[gsh];
-}
-
-void fgs_setup_pick(sint *handle, const slong id[], const sint *n,
-                    const MPI_Fint *comm, const sint *np, const sint *method)
-{
-  struct gs_data *gsh;
-  if(fgs_n==fgs_max) fgs_max+=fgs_max/2+1,
-                     fgs_info=trealloc(struct gs_data*,fgs_info,fgs_max);
-  gsh=fgs_info[fgs_n]=tmalloc(struct gs_data,1);
-  comm_init_check(&gsh->comm,*comm,*np);
-  gs_setup_aux(gsh,id,*n,0,*method,1);
-  *handle = fgs_n++;
-}
-
-void fgs_setup(sint *handle, const slong id[], const sint *n,
-               const MPI_Fint *comm, const sint *np)
-{
-  const sint method = gs_auto;
-  fgs_setup_pick(handle,id,n,comm,np,&method);
-}
-
-static void fgs_check_handle(sint handle, const char *func, unsigned line)
-{
-  if(handle<0 || handle>=fgs_n || !fgs_info[handle])
-    fail(1,__FILE__,line,"%s: invalid handle", func);
-}
-
-static const gs_dom fgs_dom[4] = { 0, gs_double, gs_sint, gs_slong };
-
-static void fgs_check_parms(sint handle, sint dom, sint op,
-                            const char *func, unsigned line)
-{
-  if(dom<1 || dom>3)
-    fail(1,__FILE__,line,"%s: datatype %d not in valid range 1-3",func,dom);
-  if(op <1 || op >4)
-    fail(1,__FILE__,line,"%s: op %d not in valid range 1-4",func,op);
-  fgs_check_handle(handle,func,line);
-}
-
-void fgs(const sint *handle, void *u, const sint *dom, const sint *op,
-         const sint *transpose)
-{
-  fgs_check_parms(*handle,*dom,*op,"gs_op",__LINE__);
-  cgs(u,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],0);
-}
-
-void figs(const sint *handle, void *u, const sint *dom, const sint *op,
-         const sint *transpose, int *wait)
-{
-  fgs_check_parms(*handle,*dom,*op,"gs_op",__LINE__);
-  cigs(u,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,fgs_info[*handle],0,wait);
-}
-
-void fgs_vec(const sint *handle, void *u, const sint *n,
-             const sint *dom, const sint *op, const sint *transpose)
-{
-  fgs_check_parms(*handle,*dom,*op,"gs_op_vec",__LINE__);
-  cgs_vec(u,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,
-          fgs_info[*handle],0);
-}
-
-void figs_vec(const sint *handle, void *u, const sint *n,
-             const sint *dom, const sint *op, const sint *transpose, int *wait)
-{
-  fgs_check_parms(*handle,*dom,*op,"gs_op_vec",__LINE__);
-  cigs_vec(u,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,
-          fgs_info[*handle],0,wait);
-}
-
-void fgs_many(const sint *handle, void *u1, void *u2, void *u3,
-              void *u4, void *u5, void *u6, const sint *n,
-              const sint *dom, const sint *op, const sint *transpose)
-{
-  void *uu[6];
-  uu[0]=u1,uu[1]=u2,uu[2]=u3,uu[3]=u4,uu[4]=u5,uu[5]=u6;
-  fgs_check_parms(*handle,*dom,*op,"gs_op_many",__LINE__);
-  cgs_many((void *const*)uu,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,
-           fgs_info[*handle],0);
-}
-
-void figs_many(const sint *handle, void *u1, void *u2, void *u3,
-              void *u4, void *u5, void *u6, const sint *n,
-              const sint *dom, const sint *op, const sint *transpose,
-	      int *wait)
-{
-  void *uu[6];
-  uu[0]=u1,uu[1]=u2,uu[2]=u3,uu[3]=u4,uu[4]=u5,uu[5]=u6;
-  fgs_check_parms(*handle,*dom,*op,"gs_op_many",__LINE__);
-  cigs_many((void *const*)uu,*n,fgs_dom[*dom],(gs_op_t)(*op-1),*transpose!=0,
-           fgs_info[*handle],0,wait);
-}
-
-void fgs_wait(int *handle)
-{
-  cgs_wait(*handle);
-}
-
-static struct array fgs_fields_array = null_array;
-
-void fgs_fields(const sint *handle,
-                void *u, const sint *stride, const sint *n,
-                const sint *dom, const sint *op, const sint *transpose)
-{
-  size_t offset;
-  void **p;
-  uint i;
-
-  fgs_check_parms(*handle,*dom,*op,"gs_op_fields",__LINE__);
-  if(*n<0) return;
-
-  array_reserve(void*,&fgs_fields_array,*n);
-  p = fgs_fields_array.ptr;
-  offset = *stride * gs_dom_size[*dom-1];
-  for(i=*n;i;--i) *p++ = u, u = (char*)u + offset;
-
-  cgs_many((void *const*)fgs_fields_array.ptr,*n,
-           fgs_dom[*dom],(gs_op_t)(*op-1),
-           *transpose!=0, fgs_info[*handle],0);
-}
-
-void fgs_free(const sint *handle)
-{
-  fgs_check_handle(*handle,"gs_free",__LINE__);
-  cgs_free(fgs_info[*handle]);
-  fgs_info[*handle] = 0;
-}
-
-void fgs_unique(slong id[], const sint *n, const MPI_Fint *c, const sint *np)
-{
-  struct comm *comm;
-  uint un = *n;
-  comm = tmalloc(struct comm, 1);
-  comm_init_check(comm, *c, *np);
-  cgs_unique(id, un, comm);
-  free(comm);
-}
diff --git a/3rdParty/gslib/src/gs.h b/3rdParty/gslib/src/gs.h
deleted file mode 100644
index 5598e589c..000000000
--- a/3rdParty/gslib/src/gs.h
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef GS_H
-#define GS_H
-
-#if !defined(COMM_H) || !defined(GS_DEFS_H) || !defined(MEM_H)
-#warning "gs.h" requires "comm.h", "gs_defs.h", and "mem.h"
-#endif
-
-/*
-  Gather/Scatter Library
-
-  The code
-  
-    struct comm c;  // see "comm.h"
-    slong id[n];    // the slong type is defined in "types.h"
-    ...
-    struct gs_data *g = gs_setup(id,n, &c, 0,gs_auto,1);
-    
-  defines a partition of the set of (processor, local index) pairs,
-    (p,i) \in S_j  iff   abs(id[i]) == j  on processor p
-  That is, all (p,i) pairs are grouped together (in group S_j) that have the
-    same id (=j).
-  S_0 is treated specially --- it is ignored completely
-    (i.e., when id[i] == 0, local index i does not participate in any
-    gather/scatter operation
-  If id[i] on proc p is negative then the pair (p,i) is "flagged". This
-  determines the non-symmetric behavior. For the simpler, symmetric case,
-  all id's should be positive.
-  
-  The second to last argument to gs_setup is the method to use, one of
-    gs_pairwise, gs_crystal_router, gs_all_reduce, gs_auto
-  The method "gs_auto" tries ~10 runs of each and chooses the fastest.
-  For a single-use handle, it makes more sense to use "gs_crystal_router".
-  
-  When "g" is no longer needed, free it with
-  
-    gs_free(g);
-  
-  A basic gather/scatter operation is, e.g.,
-  
-    double v[n]; buffer buf;  // see "mem.h" for "buffer"
-    ...
-    gs(v, gs_double,gs_add, 0, g,&buf);
-    
-  The buffer pointer can be null, in which case, a static buffer is used,
-  shared across all gs handles.
-  This gs call has the effect, (in the simple, symmetric, unflagged case)
-  
-    v[i] <--  \sum_{ (p,j) \in S_{id[i]} } v_(p) [j]
-    
-  where v_(p) [j] means v[j] on proc p. In other words, every v[i] is replaced
-  by the sum of all v[j]'s with the same id, given by id[i]. This accomplishes
-  "direct stiffness summation" corresponding to the action of QQ^T, where
-  "Q" is a boolean matrix that copies from a global vector (indexed by id)
-  to the local vectors indexed by (p,i) pairs.
-  
-  Summation on doubles is not the only operation and datatype supported. The
-  full list is defined in "gs_defs.h", and includes the operations
-    gs_add, gs_mul, gs_max, gs_min
-  and datatypes
-    gs_double, gs_float, gs_int, gs_long, gs_sint, gs_slong.
-  (The int and long types are the plain C types, whereas sint and slong
-   are defined in "types.h").
-   
-  For the nonsymmetric behavior, the "transpose" parameter is important:
-  
-    gs(v, gs_double,gs_add, transpose, g,&buf);
-    
-  When transpose == 0, any "flagged" (p,i) pairs (id[i] negative on p)
-  do not participate in the sum, but *do* still receive the sum on output.
-  As a special case, when only one (p,i) pair is unflagged per group this
-  corresponds to the rectangular "Q" matrix referred to above.
-  
-  When transpose == 1, the "flagged" (p,i) pairs *do* participate in the sum,
-  but do *not* get set on output. In the special case of only one unflagged
-  (p,i) pair, this corresponds to the transpose of "Q" referred to above.
-
-
-
-  A version for vectors (contiguously packed) is, e.g.,
-  
-    double v[n][k];
-    gs_vec(v,k, gs_double,gs_add, transpose, g,&buf);
-  
-  which is like "gs" operating on the datatype double[k],
-  with summation here being vector summation. Number of messages sent
-  is independent of k.
-  
-  For combining the communication for "gs" on multiple arrays:
-  
-    double v1[n], v2[n], ..., vk[n];
-    double (*vs)[k] = {v1, v2, ..., vk};
-    
-    gs_many(vs,k, gs_double,op, t, g,&buf);
-  
-  This call is equivalent to
-  
-    gs(v1, gs_double,op, t, g, &buf);
-    gs(v2, gs_double,op, t, g, &buf);
-    ...
-    gs(vk, gs_double,op, t, g, &buf);
-    
-  except that all communication is done together.
-  
-
-
-  Finally, gs_unique has the same basic signature as gs_setup:
-  
-    gs_unique(id,n, &c);
-    
-  This call modifies id, "flagging" (by negating id[i]) all (p,i) pairs in
-  each group except one. The sole "unflagged" member of the group is chosen
-  in an arbitrary but consistent way. If the "unique" flag is set when
-  calling gs_setup, the behavior is equivalent to first calling gs_unique,
-  except that the id array is left unmodified.
-  
-
-*/  
-
-#define gs         PREFIXED_NAME(gs       )
-#define gs_vec     PREFIXED_NAME(gs_vec   )
-#define gs_many    PREFIXED_NAME(gs_many  )
-#define igs        PREFIXED_NAME(igs      )
-#define igs_vec    PREFIXED_NAME(igs_vec  )
-#define igs_many   PREFIXED_NAME(igs_many )
-#define gs_wait    PREFIXED_NAME(gs_wait  )
-#define gs_setup   PREFIXED_NAME(gs_setup )
-#define gs_free    PREFIXED_NAME(gs_free  )
-#define gs_unique  PREFIXED_NAME(gs_unique)
-#define gs_hf2c    PREFIXED_NAME(gs_hf2c  )
-
-struct gs_data;
-typedef enum {gs_auto, gs_pairwise, gs_crystal_router, gs_all_reduce} gs_method;
-
-void gs(void *u, gs_dom dom, gs_op op, unsigned transpose,
-        struct gs_data *gsh, buffer *buf);
-void gs_vec(void *u, unsigned vn, gs_dom dom, gs_op op,
-            unsigned transpose, struct gs_data *gsh, buffer *buf);
-void gs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op,
-             unsigned transpose, struct gs_data *gsh, buffer *buf);
-
-void igs(void *u, gs_dom dom, gs_op op, unsigned transpose,
-        struct gs_data *gsh, buffer *buf, int *handle);
-void igs_vec(void *u, unsigned vn, gs_dom dom, gs_op op,
-            unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle);
-void igs_many(void *const*u, unsigned vn, gs_dom dom, gs_op op,
-             unsigned transpose, struct gs_data *gsh, buffer *buf, int *handle);
-void gs_wait(int handle);
-
-struct gs_data *gs_setup(const slong *id, uint n, const struct comm *comm,
-                         int unique, gs_method method, int verbose);
-void gs_free(struct gs_data *gsh);
-void gs_unique(slong *id, uint n, const struct comm *comm);
-struct gs_data* gs_hf2c(const sint gsh);
-
-#endif
diff --git a/3rdParty/gslib/src/gs_defs.h b/3rdParty/gslib/src/gs_defs.h
deleted file mode 100644
index df4ad7be4..000000000
--- a/3rdParty/gslib/src/gs_defs.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef GS_DEFS_H
-#define GS_DEFS_H
-
-/* requires:
-     <limits.h>, <float.h>   for GS_DEFINE_IDENTITIES()
-     "types.h"               for gs_sint, gs_slong
-*/
-   
-/*------------------------------------------------------------------------------
-  Monoid Definitions
-  
-  Here are defined the domains and operations, each combination being a
-  commutative semigroup, as well as the identity element making each a 
-  commutative monoid.
-------------------------------------------------------------------------------*/
-
-/* the supported domains */
-#define GS_FOR_EACH_DOMAIN(macro) \
-  macro(double) \
-  macro(float ) \
-  macro(int   ) \
-  macro(long  ) \
-  WHEN_LONG_LONG(macro(long_long))
-  
-/* the supported ops */
-#define GS_FOR_EACH_OP(T,macro) \
-  macro(T,add) \
-  macro(T,mul) \
-  macro(T,min) \
-  macro(T,max) \
-  macro(T,bpr)
-
-#define GS_DO_add(a,b) a+=b
-#define GS_DO_mul(a,b) a*=b
-#define GS_DO_min(a,b) if(b<a) a=b
-#define GS_DO_max(a,b) if(b>a) a=b
-#define GS_DO_bpr(a,b) \
-  do if(b!=0) { uint a_ = a; uint b_ = b; \
-       if(a_==0) { a=b_; break; } \
-       for(;;) { if(a_<b_) b_>>=1; else if(b_<a_) a_>>=1; else break; } \
-       a = a_; \
-     } while(0)
-
-/* the monoid identity elements */
-#define GS_DEFINE_MONOID_ID(T,min,max) \
-  static const T gs_identity_##T[] = { 0, 1, max, min, 0 };
-#define GS_DEFINE_IDENTITIES() \
-  GS_DEFINE_MONOID_ID(double, -DBL_MAX,  DBL_MAX) \
-  GS_DEFINE_MONOID_ID(float , -FLT_MAX,  FLT_MAX) \
-  GS_DEFINE_MONOID_ID(int   ,  INT_MIN,  INT_MAX) \
-  GS_DEFINE_MONOID_ID(long  , LONG_MIN, LONG_MAX) \
-  WHEN_LONG_LONG(GS_DEFINE_MONOID_ID(long_long,LLONG_MIN,LLONG_MAX))
-
-/*------------------------------------------------------------------------------
-  Enums and constants
-------------------------------------------------------------------------------*/
-
-/* domain enum */
-#define LIST GS_FOR_EACH_DOMAIN(ITEM) gs_dom_n
-#define ITEM(T) gs_##T,
-typedef enum { LIST } gs_dom;
-#undef ITEM
-#undef LIST
-
-#define gs_sint   TYPE_LOCAL(gs_int,gs_long,gs_long_long)
-#define gs_slong TYPE_GLOBAL(gs_int,gs_long,gs_long_long)
-
-/* domain type size array */
-#define GS_DOM_SIZE_ITEM(T) sizeof(T),
-#define GS_DEFINE_DOM_SIZES() \
-  static const unsigned gs_dom_size[] = \
-    { GS_FOR_EACH_DOMAIN(GS_DOM_SIZE_ITEM) 0 };
-
-/* operation enum */
-#define LIST GS_FOR_EACH_OP(T,ITEM) gs_op_n
-#define ITEM(T,op) gs_##op,
-typedef enum { LIST } gs_op;
-#undef ITEM
-#undef LIST
-
-#endif
diff --git a/3rdParty/gslib/src/gs_local.c b/3rdParty/gslib/src/gs_local.c
deleted file mode 100644
index 170e94d4c..000000000
--- a/3rdParty/gslib/src/gs_local.c
+++ /dev/null
@@ -1,336 +0,0 @@
-#include <string.h>
-#include <limits.h>
-#include <float.h>
-#include "c99.h"
-#include "name.h"
-#include "types.h"
-
-#define gs_gather_array        PREFIXED_NAME(gs_gather_array       )
-#define gs_init_array          PREFIXED_NAME(gs_init_array         )
-#define gs_gather              PREFIXED_NAME(gs_gather             )
-#define gs_scatter             PREFIXED_NAME(gs_scatter            )
-#define gs_init                PREFIXED_NAME(gs_init               )
-#define gs_gather_vec          PREFIXED_NAME(gs_gather_vec         )
-#define gs_scatter_vec         PREFIXED_NAME(gs_scatter_vec        )
-#define gs_init_vec            PREFIXED_NAME(gs_init_vec           )
-#define gs_gather_many         PREFIXED_NAME(gs_gather_many        )
-#define gs_scatter_many        PREFIXED_NAME(gs_scatter_many       )
-#define gs_init_many           PREFIXED_NAME(gs_init_many          )
-#define gs_gather_vec_to_many  PREFIXED_NAME(gs_gather_vec_to_many )
-#define gs_scatter_many_to_vec PREFIXED_NAME(gs_scatter_many_to_vec)
-#define gs_scatter_vec_to_many PREFIXED_NAME(gs_scatter_vec_to_many)
-
-#include "gs_defs.h"
-GS_DEFINE_IDENTITIES()
-GS_DEFINE_DOM_SIZES()
-
-/*------------------------------------------------------------------------------
-  The array gather kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_GATHER(T,OP) \
-static void gather_array_##T##_##OP( \
-  T *restrict out, const T *restrict in, uint n) \
-{                                                                \
-  for(;n;--n) { T q = *in++, *p = out++; GS_DO_##OP(*p,q); }      \
-}
-
-/*------------------------------------------------------------------------------
-  The array initialization kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_INIT(T) \
-static void init_array_##T(T *restrict out, uint n, gs_op op) \
-{                                                             \
-  const T e = gs_identity_##T[op];                            \
-  for(;n;--n) *out++=e;                                       \
-}
-
-#define DEFINE_PROCS(T) \
-  GS_FOR_EACH_OP(T,DEFINE_GATHER) \
-  DEFINE_INIT(T)
-
-GS_FOR_EACH_DOMAIN(DEFINE_PROCS)
-
-#undef DEFINE_PROCS
-#undef DEFINE_INIT
-#undef DEFINE_GATHER
-
-/*------------------------------------------------------------------------------
-  The basic gather kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_GATHER(T,OP) \
-static void gather_##T##_##OP( \
-  T *restrict out, const T *restrict in, const unsigned in_stride,           \
-  const uint *restrict map)                                                  \
-{                                                                            \
-  uint i,j;                                                                  \
-  while((i=*map++)!=UINT_MAX) {                                              \
-    T t=out[i];                                                              \
-    j=*map++; do GS_DO_##OP(t,in[j*in_stride]); while((j=*map++)!=UINT_MAX); \
-    out[i]=t;                                                                \
-  }                                                                          \
-}
-
-/*------------------------------------------------------------------------------
-  The basic scatter kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_SCATTER(T) \
-static void scatter_##T( \
-  T *restrict out, const unsigned out_stride,                      \
-  const T *restrict in, const unsigned in_stride,                  \
-  const uint *restrict map)                                        \
-{                                                                  \
-  uint i,j;                                                        \
-  while((i=*map++)!=UINT_MAX) {                                    \
-    T t=in[i*in_stride];                                           \
-    j=*map++; do out[j*out_stride]=t; while((j=*map++)!=UINT_MAX); \
-  }                                                                \
-}
-
-/*------------------------------------------------------------------------------
-  The basic initialization kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_INIT(T) \
-static void init_##T(T *restrict out, const uint *restrict map, gs_op op) \
-{                                                       \
-  uint i; const T e = gs_identity_##T[op];              \
-  while((i=*map++)!=UINT_MAX) out[i]=e;                 \
-}
-
-#define DEFINE_PROCS(T) \
-  GS_FOR_EACH_OP(T,DEFINE_GATHER) \
-  DEFINE_SCATTER(T) \
-  DEFINE_INIT(T)
-
-GS_FOR_EACH_DOMAIN(DEFINE_PROCS)
-
-#undef DEFINE_PROCS
-#undef DEFINE_INIT
-#undef DEFINE_SCATTER
-#undef DEFINE_GATHER
-
-/*------------------------------------------------------------------------------
-  The vector gather kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_GATHER(T,OP) \
-static void gather_vec_##T##_##OP( \
-  T *restrict out, const T *restrict in, const unsigned vn,                  \
-  const uint *restrict map)                                                  \
-{                                                                            \
-  uint i,j;                                                                  \
-  while((i=*map++)!=UINT_MAX) {                                              \
-    T *restrict p = &out[i*vn], *pe = p+vn;                                  \
-    j=*map++; do {                                                           \
-      const T *restrict q = &in[j*vn];                                       \
-      T *restrict pk=p; do { GS_DO_##OP(*pk,*q); ++pk, ++q; } while(pk!=pe); \
-    } while((j=*map++)!=UINT_MAX);                                           \
-  }                                                                          \
-}
-
-/*------------------------------------------------------------------------------
-  The vector scatter kernel
-------------------------------------------------------------------------------*/
-void gs_scatter_vec(
-  void *restrict out, const void *restrict in, const unsigned vn,
-  const uint *restrict map, gs_dom dom)
-{
-  unsigned unit_size = vn*gs_dom_size[dom];
-  uint i,j;
-  while((i=*map++)!=UINT_MAX) {
-    const char *t = (const char *)in + i*unit_size;
-    j=*map++; do
-      memcpy((char *)out+j*unit_size,t,unit_size);
-    while((j=*map++)!=UINT_MAX);
-  }
-}
-
-/*------------------------------------------------------------------------------
-  The vector initialization kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_INIT(T) \
-static void init_vec_##T(T *restrict out, const unsigned vn, \
-                         const uint *restrict map, gs_op op) \
-{                                                            \
-  uint i; const T e = gs_identity_##T[op];                   \
-  while((i=*map++)!=UINT_MAX) {                              \
-    T *restrict u = (T*)out + vn*i, *ue = u+vn;              \
-    do *u++ = e; while(u!=ue);                               \
-  }                                                          \
-}
-
-#define DEFINE_PROCS(T) \
-  GS_FOR_EACH_OP(T,DEFINE_GATHER) \
-  DEFINE_INIT(T)
-
-GS_FOR_EACH_DOMAIN(DEFINE_PROCS)
-
-#undef DEFINE_PROCS
-#undef DEFINE_INIT
-#undef DEFINE_GATHER
-
-#undef DO_bpr
-#undef DO_max
-#undef DO_min
-#undef DO_mul
-#undef DO_add
-
-#define SWITCH_DOMAIN_CASE(T) case gs_##T: WITH_DOMAIN(T); break;
-#define SWITCH_DOMAIN(dom) do switch(dom) { \
-    GS_FOR_EACH_DOMAIN(SWITCH_DOMAIN_CASE) case gs_dom_n: break; } while(0)
-
-#define SWITCH_OP_CASE(T,OP) case gs_##OP: WITH_OP(T,OP); break;
-#define SWITCH_OP(T,op) do switch(op) { \
-    GS_FOR_EACH_OP(T,SWITCH_OP_CASE) case gs_op_n: break; } while(0)
-
-/*------------------------------------------------------------------------------
-  Array kernels
-------------------------------------------------------------------------------*/
-void gs_gather_array(void *out, const void *in, uint n, gs_dom dom, gs_op op)
-{
-#define WITH_OP(T,OP) gather_array_##T##_##OP(out,in,n)
-#define WITH_DOMAIN(T) SWITCH_OP(T,op)
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-#undef  WITH_OP
-}
-
-void gs_init_array(void *out, uint n, gs_dom dom, gs_op op)
-{
-#define WITH_DOMAIN(T) init_array_##T(out,n,op)
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-}
-
-/*------------------------------------------------------------------------------
-  Plain kernels; vn parameter ignored but present for consistent signatures
-------------------------------------------------------------------------------*/
-void gs_gather(void *out, const void *in, const unsigned vn,
-               const uint *map, gs_dom dom, gs_op op)
-{
-#define WITH_OP(T,OP) gather_##T##_##OP(out,in,1,map)
-#define WITH_DOMAIN(T) SWITCH_OP(T,op)
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-#undef  WITH_OP
-}
-
-void gs_scatter(void *out, const void *in, const unsigned vn,
-                const uint *map, gs_dom dom)
-{
-#define WITH_DOMAIN(T) scatter_##T(out,1,in,1,map)
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-}
-
-void gs_init(void *out, const unsigned vn, const uint *map,
-             gs_dom dom, gs_op op)
-{
-#define WITH_DOMAIN(T) init_##T(out,map,op)
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-}
-
-/*------------------------------------------------------------------------------
-  Vector kernels
-------------------------------------------------------------------------------*/
-void gs_gather_vec(void *out, const void *in, const unsigned vn,
-                   const uint *map, gs_dom dom, gs_op op)
-{
-#define WITH_OP(T,OP) gather_vec_##T##_##OP(out,in,vn,map)
-#define WITH_DOMAIN(T) SWITCH_OP(T,op)
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-#undef  WITH_OP
-}
-
-void gs_init_vec(void *out, const unsigned vn, const uint *map,
-                 gs_dom dom, gs_op op)
-{
-#define WITH_DOMAIN(T) init_vec_##T(out,vn,map,op)
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-}
-
-/*------------------------------------------------------------------------------
-  Multiple array kernels
-------------------------------------------------------------------------------*/
-void gs_gather_many(void *out, const void *in, const unsigned vn,
-                    const uint *map, gs_dom dom, gs_op op)
-{
-  uint k;
-  typedef void *ptr_to_void; typedef const void *ptr_to_const_void;
-  const ptr_to_void *p = out; const ptr_to_const_void *q = in;
-#define WITH_OP(T,OP) for(k=0;k<vn;++k) gather_##T##_##OP(p[k],q[k],1,map)
-#define WITH_DOMAIN(T) SWITCH_OP(T,op)
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-#undef  WITH_OP
-}
-
-void gs_scatter_many(void *out, const void *in, const unsigned vn,
-                     const uint *map, gs_dom dom)
-{
-  uint k;
-  typedef void *ptr_to_void; typedef const void *ptr_to_const_void;
-  const ptr_to_void *p = out; const ptr_to_const_void *q = in;
-#define WITH_DOMAIN(T) for(k=0;k<vn;++k) scatter_##T(p[k],1,q[k],1,map)
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-}
-
-void gs_init_many(void *out, const unsigned vn, const uint *map,
-                  gs_dom dom, gs_op op)
-{
-  uint k;
-  typedef void *ptr_to_void; const ptr_to_void *p = out;
-#define WITH_DOMAIN(T) for(k=0;k<vn;++k) init_##T(p[k],map,op)
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-}
-
-/*------------------------------------------------------------------------------
-  Gather from strided array -> multiple arrays
-  Scatter from multiple arrays -> strided array,
-  Scatter from strided array -> multiple arrays,
-------------------------------------------------------------------------------*/
-void gs_gather_vec_to_many(void *out, const void *in, const unsigned vn,
-                           const uint *map, gs_dom dom, gs_op op)
-{
-  unsigned i; const unsigned unit_size = gs_dom_size[dom];
-  typedef void *ptr_to_void;
-  const ptr_to_void *p = out; const char *q = in;
-#define WITH_OP(T,OP) \
-  for(i=vn;i;--i) gather_##T##_##OP(*p++,(const T*)q,vn,map), q+=unit_size
-#define WITH_DOMAIN(T) SWITCH_OP(T,op)
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-#undef  WITH_OP
-}
-
-void gs_scatter_many_to_vec(void *out, const void *in, const unsigned vn,
-                            const uint *map, gs_dom dom)
-{
-  unsigned i; const unsigned unit_size = gs_dom_size[dom];
-  typedef const void *ptr_to_const_void;
-  char *p = out; const ptr_to_const_void *q = in;
-#define WITH_DOMAIN(T) \
-  for(i=vn;i;--i) scatter_##T((T*)p,vn,*q++,1,map), p+=unit_size
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-}
-
-void gs_scatter_vec_to_many(void *out, const void *in, const unsigned vn,
-                            const uint *map, gs_dom dom)
-{
-  unsigned i; const unsigned unit_size = gs_dom_size[dom];
-  typedef void *ptr_to_void;
-  const ptr_to_void *p = out; const char *q = in;
-#define WITH_DOMAIN(T) \
-  for(i=vn;i;--i) scatter_##T(*p++,1,(const T*)q,vn,map), q+=unit_size
-  SWITCH_DOMAIN(dom);
-#undef  WITH_DOMAIN
-}
-
-#undef SWITCH_OP
-#undef SWITCH_OP_CASE
-#undef SWITCH_DOMAIN
-#undef SWITCH_DOMAIN_CASE
diff --git a/3rdParty/gslib/src/gs_local.h b/3rdParty/gslib/src/gs_local.h
deleted file mode 100644
index fc7c41499..000000000
--- a/3rdParty/gslib/src/gs_local.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef GS_LOCAL_H
-#define GS_LOCAL_H
-
-#if !defined(NAME_H) || !defined(TYPES_H) || !defined(GS_DEFS_H)
-#warning "gs_local.h" requires "name.h", "types.h", and "gs_defs.h"
-#endif
-
-#define gs_gather_array        PREFIXED_NAME(gs_gather_array       )
-#define gs_init_array          PREFIXED_NAME(gs_init_array         )
-#define gs_gather              PREFIXED_NAME(gs_gather             )
-#define gs_scatter             PREFIXED_NAME(gs_scatter            )
-#define gs_init                PREFIXED_NAME(gs_init               )
-#define gs_gather_vec          PREFIXED_NAME(gs_gather_vec         )
-#define gs_scatter_vec         PREFIXED_NAME(gs_scatter_vec        )
-#define gs_init_vec            PREFIXED_NAME(gs_init_vec           )
-#define gs_gather_many         PREFIXED_NAME(gs_gather_many        )
-#define gs_scatter_many        PREFIXED_NAME(gs_scatter_many       )
-#define gs_init_many           PREFIXED_NAME(gs_init_many          )
-#define gs_gather_vec_to_many  PREFIXED_NAME(gs_gather_vec_to_many )
-#define gs_scatter_many_to_vec PREFIXED_NAME(gs_scatter_many_to_vec)
-#define gs_scatter_vec_to_many PREFIXED_NAME(gs_scatter_vec_to_many)
-
-void gs_gather_array(void *out, const void *in, uint n,
-                     gs_dom dom, gs_op op);
-void gs_init_array(void *out, uint n, gs_dom dom, gs_op op);
-
-typedef void gs_gather_fun(
-  void *out, const void *in, const unsigned vn,
-  const uint *map, gs_dom dom, gs_op op);
-typedef void gs_scatter_fun(
-  void *out, const void *in, const unsigned vn,
-  const uint *map, gs_dom dom);
-typedef void gs_init_fun(
-  void *out, const unsigned vn,
-  const uint *map, gs_dom dom, gs_op op);
-
-extern gs_gather_fun gs_gather, gs_gather_vec, gs_gather_many,
-                     gs_gather_vec_to_many;
-extern gs_scatter_fun gs_scatter, gs_scatter_vec, gs_scatter_many,
-                      gs_scatter_many_to_vec, gs_scatter_vec_to_many;
-extern gs_init_fun gs_init, gs_init_vec, gs_init_many;
-
-#endif
diff --git a/3rdParty/gslib/src/gslib.h b/3rdParty/gslib/src/gslib.h
deleted file mode 100644
index 2b1956838..000000000
--- a/3rdParty/gslib/src/gslib.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#define UNDERSCORE 1
-#define USE_NAIVE_BLAS 
-#define NO_NEX_EXITT 1
-#define GLOBAL_LONG_LONG 1
-
-#define MPI 1
-
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "comm.h"
-#include "mem.h"
-#include "gs_defs.h"
-#include "gs.h"
-#include "crs.h"
diff --git a/3rdParty/gslib/src/lob_bnd.c b/3rdParty/gslib/src/lob_bnd.c
deleted file mode 100644
index d81a9a063..000000000
--- a/3rdParty/gslib/src/lob_bnd.c
+++ /dev/null
@@ -1,285 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>    /* for cos, fabs */
-#include <float.h>
-#include "c99.h"
-#include "name.h"
-#include "types.h"
-#include "fail.h"
-#include "mem.h"
-#include "poly.h"
-
-#define lob_bnd_setup  PREFIXED_NAME(lob_bnd_setup)
-#define lob_bnd_lin_1  PREFIXED_NAME(lob_bnd_lin_1)
-#define lob_bnd_lin_2  PREFIXED_NAME(lob_bnd_lin_2)
-#define lob_bnd_lin_3  PREFIXED_NAME(lob_bnd_lin_3)
-#define lob_bnd_1      PREFIXED_NAME(lob_bnd_1    )
-#define lob_bnd_2      PREFIXED_NAME(lob_bnd_2    )
-#define lob_bnd_3      PREFIXED_NAME(lob_bnd_3    )
-
-struct dbl_range { double min,max; };
-
-/*--------------------------------------------------------------------------
-   Bounds for Polynomials on [-1,1]^d
-     given in the Lagrangian basis on
-     Gauss-Lobatto-Legendre quadrature nodes
-
-   The main parameters are the number of GLL nodes in each dimension
-     unsigned nr = ..., ns = ..., nt = ...;
-
-   The number of points in the constructed piecewise (tri-,bi-)linear bounds
-   is a parameter; more points give tighter bounds, and we expect m>n.
-
-     unsigned mr = 4*nr, ms = 4*ns, mt = 4*nt;
-
-   The necessary setup is accomplished via:
-     double *data_r = tmalloc(double, lob_bnd_size(nr,mr));
-     double *data_s = tmalloc(double, lob_bnd_size(ns,ms));
-     double *data_t = tmalloc(double, lob_bnd_size(nt,mt));
-     lob_bnd_setup(data_r, nr,mr);
-     lob_bnd_setup(data_s, ns,ms);
-     lob_bnd_setup(data_t, nt,mt);
-
-   Bounds may then be computed via:
-     double work1r[2*mr], work1s[2*ms];
-     double work2[2*mr*(ns+ms+1)];
-     double work3[2*mr*ms*(nt+mt+1)];
-     double ur[nr], us[ns];    // 1-d polynomials on the zr[] and zs[] nodes
-     double u2[ns][nr];        // 2-d polynomial on zr[] (x) zs[]
-     double u3[nt][ns][nr];    // 3-d polynomial on zr[] (x) zs[] (x) zt[]
-     struct dbl_range bound;
-
-     bound = lob_bnd_1(data_r,nr,mr, ur, work1r); // compute bounds on ur
-     bound = lob_bnd_1(data_s,ns,ms, us, work1s); // compute bounds on us
-     bound = lob_bnd_2(data_r,nr,mr, data_s,ns,ms,
-               (const double*)&u2[0][0], work2); // compute bounds on u2
-     bound = lob_bnd_3(data_r,nr,mr, data_s,ns,ms, data_t,nt,mt,
-               (const double*)&u3[0][0], work3); // compute bounds on u3
-
-    free(data_r), free(data_s), free(data_t);
-
-   The functions lob_bnd_lin_d compute the piecewise d-linear bounds.
-   Nodes for these are Chebyshev-Lobatto:
-     h[0] = -1, h[m-1] = 1;
-     for(j=1;j<m-1;++j) h[j] = cos((m-1-j)*PI/(m-1));
-   The functions lob_bnd_d simply call these and return the min and max
-   over all nodes.
-
-  --------------------------------------------------------------------------*/
-
-#define PI 3.1415926535897932384626433832795028841971693993751058209749445923
-
-void lob_bnd_setup(double *restrict data, unsigned n, unsigned m)
-{{
-  unsigned nm = n*m, i,j;
-  double *restrict z=data,
-         *restrict Q=z+n, *restrict h=Q+2*n,
-         *restrict lb=h+m, *restrict lbnp=lb+2*nm;
-  double *restrict pl = tmalloc(double,5*n + gll_lag_size(n)),
-         *restrict dl = pl+n, *restrict pr=dl+n, *restrict dr=pr+n,
-         *restrict p=dr+n, *restrict gll_data=p+n;
-  lagrange_fun *lag = gll_lag_setup(gll_data,n);
-
-  /* set z and Q to Lobatto nodes, weights */
-  lobatto_quad(z,Q,n);
-
-  /* Q0, Q1 : linear functionals on the GLL nodal basis
-              for the zeroth and first Legendre coefficient */
-  for(i=n;i;) --i, Q[2*i]=Q[i]/2, Q[2*i+1] = 3*Q[2*i]*z[i];
-  /*for(i=0;i<n;++i) Q0[i]=Q0[i]/2, Q1[i] = 3*Q0[i]*z[i];*/
-
-  /* h : m Chebyshev nodes */
-  h[0] = -1, h[m-1] = 1;
-  for(j=1;j<m-1;++j) h[j] = cos((m-1-j)*PI/(m-1));
-
-  /* lv, uv : lower, upper piecewise linear bounds (nodes h) of
-              Lagrangian basis functions for Gauss-Lobatto nodes */
-  for(i=0;i<n;++i)
-    lb[(i*m+  0)*2+1]=lb[(i*m+  0)*2+0]=(i==  0?1:0),
-    lb[(i*m+m-1)*2+1]=lb[(i*m+m-1)*2+0]=(i==n-1?1:0);
-
-  lag(pl,gll_data,n,1,(h[0]+h[1])/2);
-  for(j=1;j<m-1;++j) {
-    double x = h[j], xl = (x+h[j-1])/2, xr = (x+h[j+1])/2;
-    lag(pr,gll_data,n,1,xr);
-    lag(p ,gll_data,n,0,x );
-    for(i=0;i<n;++i) {
-      double lo,up, cl = pl[i] + (x-xl)*dl[i], cr = pr[i] + (x-xr)*dr[i];
-      if(cl<cr) lo=cl,up=cr; else lo=cr,up=cl;
-      if(p[i]<lo) lo=p[i];
-      if(up<p[i]) up=p[i];
-      lb[(i*m+j)*2+0] = lo, lb[(i*m+j)*2+1] = up;
-    }
-    memcpy(pl,pr,2*n*sizeof(double));
-  }
-
-  /* lbnp : lb split into negative and positive parts */
-  for(i=0;i<nm;++i) {
-    double f;
-    lbnp[4*i+0]=lbnp[4*i+1]=lbnp[4*i+2]=lbnp[4*i+3]=0;
-    if((f=lb[2*i+0])<0) lbnp[4*i+0]=f; else lbnp[4*i+1]=f;
-    if((f=lb[2*i+1])<0) lbnp[4*i+2]=f; else lbnp[4*i+3]=f;
-  }
-
-  free(pl);
-}}
-
-static void lob_bnd_fst(
-  double *restrict b,
-  const double *restrict z, const double *restrict Q, const double *restrict h,
-  const double *restrict lb, unsigned n, unsigned m,
-  const double *restrict u)
-{
-  unsigned i,j;
-  double a0=0, a1=0;
-  for(i=0;i<n;++i) a0 += Q[2*i]*u[i], a1 += Q[2*i+1]*u[i];
-  for(j=0;j<m;++j) b[2*j+1] = b[2*j+0] = a0 + a1*h[j];
-  for(i=0;i<n;++i) {
-    double w = u[i] - (a0 + a1*z[i]);
-    if(w>=0)
-      for(j=0;j<m;++j) b[2*j+0]+=w*lb[0], b[2*j+1]+=w*lb[1], lb+=2;
-    else
-      for(j=0;j<m;++j) b[2*j+0]+=w*lb[1], b[2*j+1]+=w*lb[0], lb+=2;
-  }
-}
-
-static void lob_bnd_ext(
-  double *restrict b_,
-  const double *restrict z, const double *restrict Q, const double *restrict h,
-  const double *restrict lbnp, unsigned n, unsigned m,
-  const double *restrict br_, unsigned mr,
-  double *restrict a)
-{
-  const double *restrict br = br_;
-  double *restrict b = b_;
-  unsigned i,j,k;
-  for(i=0;i<mr;++i) a[2*i+1]=a[2*i+0]=0;
-  for(j=0;j<n;++j) {
-    double t, q0 = Q[2*j], q1 = Q[2*j+1];
-    for(i=0;i<mr;++i) t=(br[0]+br[1])/2, br+=2, a[2*i]+=q0*t, a[2*i+1]+=q1*t;
-  }
-  for(i=0;i<mr;++i) {
-    double a0=a[2*i],a1=a[2*i+1];
-    for(k=0;k<m;++k) b[1]=b[0]=a0+a1*h[k], b+=2;
-  }
-  br=br_;
-  for(j=0;j<n;++j,lbnp+=4*m) {
-    double zj = z[j];
-    b = b_;
-    for(i=0;i<mr;++i) {
-      double t = a[2*i] + a[2*i+1]*zj;
-      double w0 = *br++ - t;
-      double w1 = *br++ - t;
-      if(w0>=0)      /* 0  <= w0 <= w1 */
-        for(k=0;k<m;++k)
-          *b++ += w0 * lbnp[4*k+1] + w1 * lbnp[4*k+0],
-          *b++ += w1 * lbnp[4*k+3] + w0 * lbnp[4*k+2];
-      else if(w1<=0) /* w0 <= w1 <= 0  */
-        for(k=0;k<m;++k)
-          *b++ += w0 * lbnp[4*k+3] + w1 * lbnp[4*k+2],
-          *b++ += w1 * lbnp[4*k+1] + w0 * lbnp[4*k+0];
-      else           /* w0 <  0  <  w1 */
-        for(k=0;k<m;++k)
-          *b++ += w0 * lbnp[4*k+3] + w1 * lbnp[4*k+0],
-          *b++ += w1 * lbnp[4*k+3] + w0 * lbnp[4*k+0];
-    }
-  }
-}
-
-void lob_bnd_lin_1(double *restrict b,
-                   const double *restrict lob_bnd_data, unsigned n, unsigned m,
-                   const double *restrict u, uint un)
-{
-  const double *z=lob_bnd_data, *Q=z+n, *h=Q+2*n, *lb=h+m;
-  for(;un;--un, u+=n, b+=2*m) lob_bnd_fst(b, z,Q,h,lb,n,m, u);
-}
-
-/* work holds 2*mr + 2*ns*mr doubles */
-void lob_bnd_lin_2(
-  double *restrict b,
-  const double *lob_bnd_data_r, unsigned nr, unsigned mr,
-  const double *lob_bnd_data_s, unsigned ns, unsigned ms,
-  const double *restrict u, uint un, double *restrict work)
-{
-  unsigned mrs = mr*ms;
-  const double *zr=lob_bnd_data_r,*Qr=zr+nr,*hr=Qr+2*nr,*lb_r=hr+mr;
-  const double *zs=lob_bnd_data_s,*Qs=zs+ns,*hs=Qs+2*ns,*lbnp_s=hs+ms+2*ns*ms;
-  double *a = work, *br = a+2*mr;
-  for(;un;--un, b+=2*mrs) {
-    double *br_; unsigned i;
-    for(i=0,br_=br;i<ns;++i,br_+=2*mr,u+=nr)
-      lob_bnd_fst(br_, zr,Qr,hr,lb_r,nr,mr, u);
-    lob_bnd_ext(b, zs,Qs,hs,lbnp_s,ns,ms, br,mr, a);
-  }
-}
-
-/* work holds 2*mr*ms + 2*nt*ms*mr doubles */
-void lob_bnd_lin_3(
-  double *restrict b,
-  const double *lob_bnd_data_r, unsigned nr, unsigned mr,
-  const double *lob_bnd_data_s, unsigned ns, unsigned ms,
-  const double *lob_bnd_data_t, unsigned nt, unsigned mt,
-  const double *restrict u, uint un, double *restrict work)
-{
-  unsigned nst=ns*nt, mrst=mr*ms*mt, mrs=mr*ms, mr_ns=mr*ns;
-  const double *zr=lob_bnd_data_r,*Qr=zr+nr,*hr=Qr+2*nr,*lb_r=hr+mr;
-  const double *zs=lob_bnd_data_s,*Qs=zs+ns,*hs=Qs+2*ns,*lbnp_s=hs+ms+2*ns*ms;
-  const double *zt=lob_bnd_data_t,*Qt=zt+nt,*ht=Qt+2*nt,*lbnp_t=ht+mt+2*nt*mt;
-  double *a = work, *bs = a+2*mr*ms;
-  for(;un;--un, b+=2*mrst) {
-    double *br_, *bs_; unsigned i;
-    for(i=0,br_=b;i<nst;++i,br_+=2*mr,u+=nr)
-      lob_bnd_fst(br_, zr,Qr,hr,lb_r,nr,mr, u);
-    for(i=0,br_=b,bs_=bs;i<nt;++i,br_+=2*mr_ns,bs_+=2*mrs)
-      lob_bnd_ext(bs_, zs,Qs,hs,lbnp_s,ns,ms, br_,mr, a);
-    lob_bnd_ext(b, zt,Qt,ht,lbnp_t,nt,mt, bs,mrs, a);
-  }
-}
-
-static struct dbl_range minmax(const double *restrict b, unsigned m)
-{
-  struct dbl_range bnd;
-  bnd.min = b[0], bnd.max = b[1];
-  for(--m,b+=2;m;--m,b+=2)
-    bnd.min = b[0]<bnd.min?b[0]:bnd.min,
-    bnd.max = b[1]>bnd.max?b[1]:bnd.max;
-  return bnd;
-}
-
-/* work holds 2*m doubles */
-struct dbl_range lob_bnd_1(
-  const double *restrict lob_bnd_data, unsigned n, unsigned m,
-  const double *restrict u, double *restrict work)
-{
-  lob_bnd_lin_1(work, lob_bnd_data,n,m, u,1);
-  return minmax(work,m);
-}
-
-/* work holds 2*mr*ms + 2*mr + 2*mr*ns
-             =2*mr*(ms+1+ns) doubles */
-struct dbl_range lob_bnd_2(
-  const double *lob_bnd_data_r, unsigned nr, unsigned mr,
-  const double *lob_bnd_data_s, unsigned ns, unsigned ms,
-  const double *restrict u, double *restrict work)
-{
-  unsigned m = mr*ms;
-  lob_bnd_lin_2(work, lob_bnd_data_r,nr,mr,
-                      lob_bnd_data_s,ns,ms, u,1, work+2*m);
-  return minmax(work,m);
-}
-
-/* work holds 2*mr*ms*mt + 2*mr*ms + 2*nt*ms*mr
-             =2*mr*ms*(nt+mt+1) doubles */
-struct dbl_range lob_bnd_3(
-  const double *lob_bnd_data_r, unsigned nr, unsigned mr,
-  const double *lob_bnd_data_s, unsigned ns, unsigned ms,
-  const double *lob_bnd_data_t, unsigned nt, unsigned mt,
-  const double *restrict u, double *restrict work)
-{
-  unsigned m = mr*ms*mt;
-  lob_bnd_lin_3(work, lob_bnd_data_r,nr,mr,
-                      lob_bnd_data_s,ns,ms,
-                      lob_bnd_data_t,nt,mt, u,1, work+2*m);
-  return minmax(work,m);
-}
diff --git a/3rdParty/gslib/src/lob_bnd.h b/3rdParty/gslib/src/lob_bnd.h
deleted file mode 100644
index b47256b3a..000000000
--- a/3rdParty/gslib/src/lob_bnd.h
+++ /dev/null
@@ -1,111 +0,0 @@
-#ifndef LOB_BND_H
-#define LOB_BND_H
-
-#if !defined(TYPES_H) || !defined(NAME_H)
-#warning "lob_bnd.h" requires "types.h" and "name.h"
-#endif
-
-#define lob_bnd_setup  PREFIXED_NAME(lob_bnd_setup)
-#define lob_bnd_lin_1  PREFIXED_NAME(lob_bnd_lin_1)
-#define lob_bnd_lin_2  PREFIXED_NAME(lob_bnd_lin_2)
-#define lob_bnd_lin_3  PREFIXED_NAME(lob_bnd_lin_3)
-#define lob_bnd_1      PREFIXED_NAME(lob_bnd_1    )
-#define lob_bnd_2      PREFIXED_NAME(lob_bnd_2    )
-#define lob_bnd_3      PREFIXED_NAME(lob_bnd_3    )
-
-/*--------------------------------------------------------------------------
-   Bounds for Polynomials on [-1,1]^d
-     given in the Lagrangian basis on
-     Gauss-Lobatto-Legendre quadrature nodes
-
-   The main parameters are the number of GLL nodes in each dimension
-     unsigned nr = ..., ns = ..., nt = ...;
-
-   The number of points in the constructed piecewise (tri-,bi-)linear bounds
-   is a parameter; more points give tighter bounds, and we expect m>n.
-   
-     unsigned mr = 4*nr, ms = 4*ns, mt = 4*nt;
-   
-   The necessary setup is accomplished via:
-     double *data_r = tmalloc(double, lob_bnd_size(nr,mr));
-     double *data_s = tmalloc(double, lob_bnd_size(ns,ms));
-     double *data_t = tmalloc(double, lob_bnd_size(nt,mt));
-     lob_bnd_setup(data_r, nr,mr);
-     lob_bnd_setup(data_s, ns,ms);
-     lob_bnd_setup(data_t, nt,mt);
- 
-   Bounds may then be computed via:
-     double work1r[2*mr], work1s[2*ms];
-     double work2[2*mr*(ns+ms+1)];
-     double work3[2*mr*ms*(nt+mt+1)];
-     double ur[nr], us[ns];    // 1-d polynomials on the zr[] and zs[] nodes
-     double u2[ns][nr];        // 2-d polynomial on zr[] (x) zs[]
-     double u3[nt][ns][nr];    // 3-d polynomial on zr[] (x) zs[] (x) zt[]
-     struct dbl_range bound;
-     
-     bound = lob_bnd_1(data_r,nr,mr, ur, work1r); // compute bounds on ur
-     bound = lob_bnd_1(data_s,ns,ms, us, work1s); // compute bounds on us
-     bound = lob_bnd_2(data_r,nr,mr, data_s,ns,ms,
-               (const double*)&u2[0][0], work2); // compute bounds on u2
-     bound = lob_bnd_3(data_r,nr,mr, data_s,ns,ms, data_t,nt,mt,
-               (const double*)&u3[0][0], work3); // compute bounds on u3
-
-    free(data_r), free(data_s), free(data_t);
-
-   The functions lob_bnd_lin_d compute the piecewise d-linear bounds.
-   Nodes for these are Chebyshev-Lobatto:
-     h[0] = -1, h[m-1] = 1;
-     for(j=1;j<m-1;++j) h[j] = cos((m-1-j)*PI/(m-1));
-   The functions lob_bnd_d simply call these and return the min and max
-   over all nodes.
-    
-  --------------------------------------------------------------------------*/
-
-static unsigned lob_bnd_size(unsigned n, unsigned m)
-{ return m+3*n*(2*m+1); }
-
-void lob_bnd_setup(double *restrict data, unsigned n, unsigned m);
-
-void lob_bnd_lin_1(double *restrict b,
-                   const double *restrict lob_bnd_data, unsigned n, unsigned m,
-                   const double *restrict u, uint un);
-
-/* work holds 2*mr + 2*ns*mr doubles */
-void lob_bnd_lin_2(
-  double *restrict b,
-  const double *lob_bnd_data_r, unsigned nr, unsigned mr,
-  const double *lob_bnd_data_s, unsigned ns, unsigned ms,
-  const double *restrict u, uint un, double *restrict work);
-
-/* work holds 2*mr*ms + 2*nt*ms*mr doubles */
-void lob_bnd_lin_3(
-  double *restrict b,
-  const double *lob_bnd_data_r, unsigned nr, unsigned mr,
-  const double *lob_bnd_data_s, unsigned ns, unsigned ms,
-  const double *lob_bnd_data_t, unsigned nt, unsigned mt,
-  const double *restrict u, uint un, double *restrict work);
-
-struct dbl_range { double min, max; };
-
-/* work holds 2*m doubles */
-struct dbl_range lob_bnd_1(
-  const double *restrict lob_bnd_data, unsigned n, unsigned m,
-  const double *restrict u, double *restrict work);
-
-/* work holds 2*mr*ms + 2*mr + 2*mr*ns
-             =2*mr*(ns+ms+1) doubles */
-struct dbl_range lob_bnd_2(
-  const double *lob_bnd_data_r, unsigned nr, unsigned mr,
-  const double *lob_bnd_data_s, unsigned ns, unsigned ms,
-  const double *restrict u, double *restrict work);
-
-/* work holds 2*mr*ms*mt + 2*mr*ms + 2*nt*ms*mr
-             =2*mr*ms*(nt+mt+1) doubles */
-struct dbl_range lob_bnd_3(
-  const double *lob_bnd_data_r, unsigned nr, unsigned mr,
-  const double *lob_bnd_data_s, unsigned ns, unsigned ms,
-  const double *lob_bnd_data_t, unsigned nt, unsigned mt,
-  const double *restrict u, double *restrict work);
-
-#endif
-
diff --git a/3rdParty/gslib/src/mem.h b/3rdParty/gslib/src/mem.h
deleted file mode 100644
index b84b27feb..000000000
--- a/3rdParty/gslib/src/mem.h
+++ /dev/null
@@ -1,168 +0,0 @@
-#ifndef MEM_H
-#define MEM_H
-
-/* requires:
-     <stddef.h> for size_t, offsetof
-     <stdlib.h> for malloc, calloc, realloc, free
-     <string.h> for memcpy
-     "c99.h"
-     "fail.h"
-*/
-
-#if !defined(C99_H) || !defined(FAIL_H)
-#error "mem.h" requires "c99.h" and "fail.h"
-#endif
-
-/*
-   All memory management goes through the wrappers defined in this
-   header. Diagnostics can be turned on with
-     -DPRINT_MALLOCS=1
-   Then all memory management operations will be printed to stdout.
-
-   Most memory management occurs through use of the "array" type,
-   defined below, which defines a generic dynamically-sized array
-   that grows in bursts. The "buffer" type is a "char" array and
-   is often passed around by code to provide a common area for
-   scratch work.
-*/
-
-#ifndef PRINT_MALLOCS
-#  define PRINT_MALLOCS 0
-#else
-#  include <stdio.h>
-#  ifndef comm_gbl_id
-#    define comm_gbl_id PREFIXED_NAME(comm_gbl_id)
-#    define comm_gbl_np PREFIXED_NAME(comm_gbl_np)
-#    include "types.h"
-     extern uint comm_gbl_id, comm_gbl_np;
-#  endif
-#endif
-
-/*--------------------------------------------------------------------------
-   Memory Allocation Wrappers to Catch Out-of-memory
-  --------------------------------------------------------------------------*/
-
-static inline void *smalloc(size_t size, const char *file, unsigned line)
-{
-  void *restrict res = malloc(size);
-  #if PRINT_MALLOCS
-  fprintf(stdout,"MEM: proc %04d: %p = malloc(%ld) @ %s(%u)\n",
-          (int)comm_gbl_id,res,(long)size,file,line), fflush(stdout);
-  #endif
-  if(!res && size)
-    fail(1,file,line,"allocation of %ld bytes failed\n",(long)size);
-  return res;
-}
-
-static inline void *scalloc(
-  size_t nmemb, size_t size, const char *file, unsigned line)
-{
-  void *restrict res = calloc(nmemb, size);
-  #if PRINT_MALLOCS
-  fprintf(stdout,"MEM: proc %04d: %p = calloc(%ld) @ %s(%u)\n",
-          (int)comm_gbl_id,res,(long)size*nmemb,file,line), fflush(stdout);
-  #endif
-  if(!res && nmemb)
-    fail(1,file,line,"allocation of %ld bytes failed\n",
-           (long)size*nmemb);
-  return res;
-}
-
-static inline void *srealloc(
-  void *restrict ptr, size_t size, const char *file, unsigned line)
-{
-  void *restrict res = realloc(ptr, size);
-  #if PRINT_MALLOCS
-  if(res!=ptr) {
-    if(ptr)
-      fprintf(stdout,"MEM: proc %04d: %p freed by realloc @ %s(%u)\n",
-              (int)comm_gbl_id,ptr,file,line), fflush(stdout);
-    fprintf(stdout,"MEM: proc %04d: %p = realloc of %p to %lu @ %s(%u)\n",
-            (int)comm_gbl_id,res,ptr,(long)size,file,line), fflush(stdout);
-  } else
-    fprintf(stdout,"MEM: proc %04d: %p realloc'd to %lu @ %s(%u)\n",
-            (int)comm_gbl_id,res,(long)size,file,line), fflush(stdout);
-  #endif
-  if(!res && size)
-    fail(1,file,line,"allocation of %ld bytes failed\n",(long)size);
-  return res;
-}
-
-#define tmalloc(type, count) \
-  ((type*) smalloc((count)*sizeof(type),__FILE__,__LINE__) )
-#define tcalloc(type, count) \
-  ((type*) scalloc((count),sizeof(type),__FILE__,__LINE__) )
-#define trealloc(type, ptr, count) \
-  ((type*) srealloc((ptr),(count)*sizeof(type),__FILE__,__LINE__) )
-
-#if PRINT_MALLOCS
-static inline void sfree(void *restrict ptr, const char *file, unsigned line)
-{
-  free(ptr);
-  fprintf(stdout,"MEM: proc %04d: %p freed @ %s(%u)\n",
-          (int)comm_gbl_id,ptr,file,line), fflush(stdout);
-}
-#define free(x) sfree(x,__FILE__,__LINE__)
-#endif
-
-/*--------------------------------------------------------------------------
-   A dynamic array
-  --------------------------------------------------------------------------*/
-struct array { void *ptr; size_t n,max; };
-#define null_array {0,0,0}
-static void array_init_(struct array *a, size_t max, size_t size,
-                        const char *file, unsigned line)
-{
-  a->n=0, a->max=max, a->ptr=smalloc(max*size,file,line);
-}
-static void array_resize_(struct array *a, size_t max, size_t size,
-                          const char *file, unsigned line)
-{
-  a->max=max, a->ptr=srealloc(a->ptr,max*size,file,line);
-}
-static void *array_reserve_(struct array *a, size_t min, size_t size,
-                            const char *file, unsigned line)
-{
-  size_t max = a->max;
-  if(max<min) {
-    max+=max/2+1;
-    if(max<min) max=min;
-    array_resize_(a,max,size,file,line);
-  }
-  return a->ptr;
-}
-
-#define array_free(a) (free((a)->ptr))
-#define array_init(T,a,max) array_init_(a,max,sizeof(T),__FILE__,__LINE__)
-#define array_resize(T,a,max) array_resize_(a,max,sizeof(T),__FILE__,__LINE__)
-#define array_reserve(T,a,min) array_reserve_(a,min,sizeof(T),__FILE__,__LINE__)
-
-static void array_cat_(size_t size, struct array *d, const void *s, size_t n,
-                       const char *file, unsigned line)
-{
-  void *out = array_reserve_(d,d->n+n,size, file,line);
-  memcpy((char*)out+d->n*size, s, n*size);
-  d->n+=n;
-}
-
-#define array_cat(T,d,s,n) array_cat_(sizeof(T),d,s,n,__FILE__,__LINE__)
-
-/*--------------------------------------------------------------------------
-   Buffer = char array
-  --------------------------------------------------------------------------*/
-typedef struct array buffer;
-#define null_buffer null_array
-#define buffer_init(b,max) array_init(char,b,max)
-#define buffer_resize(b,max) array_resize(char,b,max)
-#define buffer_reserve(b,max) array_reserve(char,b,max)
-#define buffer_free(b) array_free(b)
-
-/*--------------------------------------------------------------------------
-   Alignment routines
-  --------------------------------------------------------------------------*/
-#define ALIGNOF(T) offsetof(struct { char c; T x; }, x)
-static size_t align_as_(size_t a, size_t n) { return (n+a-1)/a*a; }
-#define align_as(T,n) align_as_(ALIGNOF(T),n)
-#define align_ptr(T,base,offset) ((T*)((char*)(base)+align_as(T,offset)))
-#endif
-
diff --git a/3rdParty/gslib/src/name.h b/3rdParty/gslib/src/name.h
deleted file mode 100644
index b4bcd9169..000000000
--- a/3rdParty/gslib/src/name.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#ifndef NAME_H
-#define NAME_H
-
-/* establishes some macros to establish
-   * the FORTRAN naming convention
-     default      gs_setup, etc.
-     -DUPCASE     GS_SETUP, etc.
-     -DUNDERSCORE gs_setup_, etc.
-   * a prefix for all external (non-FORTRAN) function names
-     for example, -DPREFIX=jl_   transforms fail -> jl_fail
-   * a prefix for all external FORTRAN function names     
-     for example, -DFPREFIX=jlf_ transforms gs_setup_ -> jlf_gs_setup_
-*/
-
-/* the following macro functions like a##b,
-   but will expand a and/or b if they are themselves macros */
-#define TOKEN_PASTE_(a,b) a##b
-#define TOKEN_PASTE(a,b) TOKEN_PASTE_(a,b)
-
-#ifdef PREFIX
-#  define PREFIXED_NAME(x) TOKEN_PASTE(PREFIX,x)
-#else
-#  define PREFIXED_NAME(x) x
-#endif
-
-#ifdef FPREFIX
-#  define FPREFIXED_NAME(x) TOKEN_PASTE(FPREFIX,x)
-#else
-#  define FPREFIXED_NAME(x) x
-#endif
-
-#if defined(UPCASE)
-#  define FORTRAN_NAME(low,up) FPREFIXED_NAME(up)
-#  define FORTRAN_UNPREFIXED(low,up) up
-#elif defined(UNDERSCORE)
-#  define FORTRAN_NAME(low,up) FPREFIXED_NAME(TOKEN_PASTE(low,_))
-#  define FORTRAN_UNPREFIXED(low,up) TOKEN_PASTE(low,_)
-#else
-#  define FORTRAN_NAME(low,up) FPREFIXED_NAME(low)
-#  define FORTRAN_UNPREFIXED(low,up) low
-#endif
-
-#endif
-
diff --git a/3rdParty/gslib/src/obbox.c b/3rdParty/gslib/src/obbox.c
deleted file mode 100644
index 22c4614f3..000000000
--- a/3rdParty/gslib/src/obbox.c
+++ /dev/null
@@ -1,341 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "mem.h"
-#include "tensor.h"
-#include "poly.h"
-#include "lob_bnd.h"
-
-#define obbox_calc_2  PREFIXED_NAME(obbox_calc_2)
-#define obbox_calc_3  PREFIXED_NAME(obbox_calc_3)
-
-struct obbox_2 { double c0[2], A[4];
-                 struct dbl_range x[2]; };
-
-struct obbox_3 { double c0[3], A[9];
-                 struct dbl_range x[3]; };
-
-
-static void copy_strided(double *out, const double *in,
-                         unsigned g, unsigned s, unsigned n)
-{
-  if(g==1) for(;n;--n,in+=s) *out++ = *in;
-  else {
-    s *= g;
-    for(;n;--n,in+=s) memcpy(out,in,g*sizeof(double)), out+=g;
-  }
-}
-
-static void mat_inv_2(double inv[4], const double A[4])
-{
-  const double idet = 1/(A[0]*A[3]-A[1]*A[2]);
-  inv[0] =   idet*A[3];
-  inv[1] = -(idet*A[1]);
-  inv[2] = -(idet*A[2]);
-  inv[3] =   idet*A[0];
-}
-
-static void mat_inv_3(double inv[9], const double A[9])
-{
-  const double a = A[4]*A[8]-A[5]*A[7],
-               b = A[5]*A[6]-A[3]*A[8],
-               c = A[3]*A[7]-A[4]*A[6],
-            idet = 1/(A[0]*a+A[1]*b+A[2]*c);
-  inv[0] = idet*a;
-  inv[1] = idet*(A[2]*A[7]-A[1]*A[8]);
-  inv[2] = idet*(A[1]*A[5]-A[2]*A[4]);
-  inv[3] = idet*b;
-  inv[4] = idet*(A[0]*A[8]-A[2]*A[6]);
-  inv[5] = idet*(A[2]*A[3]-A[0]*A[5]);
-  inv[6] = idet*c;
-  inv[7] = idet*(A[1]*A[6]-A[0]*A[7]);
-  inv[8] = idet*(A[0]*A[4]-A[1]*A[3]);
-}
-
-static struct dbl_range dbl_range_merge(struct dbl_range a, struct dbl_range b)
-{
-  struct dbl_range m;
-  m.min = b.min<a.min?b.min:a.min,
-  m.max = a.max>b.max?a.max:b.max;
-  return m;
-}
-
-static struct dbl_range dbl_range_expand(struct dbl_range b, double tol)
-{
-  double a = (b.min+b.max)/2, l = (b.max-b.min)*(1+tol)/2;
-  struct dbl_range m;
-  m.min = a-l, m.max = a+l;
-  return m;
-}
-
-static void bbox_2_tfm(double *out, const double x0[2], const double Ji[4],
-                       const double *x, const double *y, unsigned n)
-{
-  unsigned i;
-  for(i=0;i<n;++i) {
-    const double dx = x[i]-x0[0], dy = y[i]-x0[1];
-    out[  i] = Ji[0]*dx + Ji[1]*dy;
-    out[n+i] = Ji[2]*dx + Ji[3]*dy;
-  }
-}
-
-static void bbox_3_tfm(double *out, const double x0[3], const double Ji[9],
-                       const double *x, const double *y, const double *z,
-                       unsigned n)
-{
-  unsigned i;
-  for(i=0;i<n;++i) {
-    const double dx = x[i]-x0[0], dy = y[i]-x0[1], dz = z[i]-x0[2];
-    out[    i] = Ji[0]*dx + Ji[1]*dy + Ji[2]*dz;
-    out[  n+i] = Ji[3]*dx + Ji[4]*dy + Ji[5]*dz;
-    out[2*n+i] = Ji[6]*dx + Ji[7]*dy + Ji[8]*dz;
-  }
-}
-
-#if 0
-
-/* positive when possibly inside */
-double obbox_axis_test_2(const struct obbox_2 *const b,
-                         const double x, const double y)
-{
-  const double bx = (x-b->x[0].min)*(b->x[0].max-x);
-  return bx<0 ? bx : (y-b->x[1].min)*(b->x[1].max-y);
-}
-
-/* positive when possibly inside */
-double obbox_test_2(const struct obbox_2 *const b,
-                    const double x, const double y)
-{
-  const double bxy = obbox_axis_test_2(b,x,y);
-  if(bxy<0) return bxy; else {
-    const double dx = x-b->c0[0], dy = y-b->c0[1];
-    const double r = b->A[0]*dx + b->A[1]*dy,
-                 s = b->A[2]*dx + b->A[3]*dy;
-    const double br = (r+1)*(1-r);
-    return br<0 ? br : (s+1)*(1-s);
-  }
-}
-
-#endif
-
-#define DO_MAX(a,b) do { unsigned temp = b; if(temp>a) a=temp; } while(0)
-
-void obbox_calc_2(struct obbox_2 *out,
-                  const double *const elx[2],
-                  const unsigned n[2], uint nel,
-                  const unsigned m[2], const double tol)
-{
-  const double *x = elx[0], *y = elx[1];
-  const unsigned nr = n[0], ns = n[1];
-  const unsigned mr = m[0], ms = m[1];
-
-  const unsigned nrs = nr*ns;
-  double *data;
-  const unsigned lbsize0 = lob_bnd_size(nr,mr),
-                 lbsize1 = lob_bnd_size(ns,ms);
-  unsigned wsize = 4*ns+2*ms;
-  DO_MAX(wsize,2*nr+2*mr);
-  DO_MAX(wsize,gll_lag_size(nr));
-  DO_MAX(wsize,gll_lag_size(ns));
-  data = tmalloc(double, 2*(nr+ns)+lbsize0+lbsize1+wsize);
-
-  {
-    double *const I0r = data, *const I0s = data+2*nr;
-    double *const lob_bnd_data_r = data+2*(nr+ns),
-           *const lob_bnd_data_s = data+2*(nr+ns)+lbsize0;
-    double *const work = data+2*(nr+ns)+lbsize0+lbsize1;
-
-    #define SETUP_DIR(r) do { \
-      lagrange_fun *const lag = gll_lag_setup(work, n##r); \
-      lag(I0##r, work,n##r,1, 0); \
-      lob_bnd_setup(lob_bnd_data_##r, n##r,m##r); \
-    } while(0)
-    
-    SETUP_DIR(r); SETUP_DIR(s);
-    
-    #undef SETUP_DIR
-    
-    for(;nel;--nel,x+=nrs,y+=nrs,++out) {
-      double x0[2], J[4], Ji[4];
-      struct dbl_range ab[2], tb[2];
-  
-      /* double work[2*nr] */
-      x0[0] = tensor_ig2(J  , I0r,nr, I0s,ns, x, work);
-      x0[1] = tensor_ig2(J+2, I0r,nr, I0s,ns, y, work);
-      mat_inv_2(Ji, J);
-
-      /* double work[2*m##r] */
-      #define DO_BOUND(bnd,merge,r,x,work) do { \
-        struct dbl_range b = \
-        lob_bnd_1(lob_bnd_data_##r,n##r,m##r, x, work); \
-        if(merge) bnd=dbl_range_merge(bnd,b); else bnd=b; \
-      } while(0)
-
-      /* double work[2*n##r + 2*m##r] */
-      #define DO_EDGE(merge,r,x,y,work) do { \
-        DO_BOUND(ab[0],merge,r,x,work); \
-        DO_BOUND(ab[1],merge,r,y,work); \
-        bbox_2_tfm(work, x0,Ji, x,y,n##r); \
-        DO_BOUND(tb[0],merge,r,(work)     ,(work)+2*n##r); \
-        DO_BOUND(tb[1],merge,r,(work)+n##r,(work)+2*n##r); \
-      } while(0)
-
-      DO_EDGE(0,r,x,y,work);
-      DO_EDGE(1,r,&x[nrs-nr],&y[nrs-nr],work);
-
-      /* double work[4*ns + 2*ms] */
-      #define GET_EDGE(off) do { \
-        copy_strided(work   , x+off,1,nr,ns); \
-        copy_strided(work+ns, y+off,1,nr,ns); \
-        DO_EDGE(1,s,work,work+ns,work+2*ns); \
-      } while(0)
-  
-      GET_EDGE(0);
-      GET_EDGE(nr-1);
-  
-      #undef GET_EDGE
-      #undef DO_EDGE
-      #undef DO_BOUND
-
-      out->x[0] = dbl_range_expand(ab[0],tol),
-      out->x[1] = dbl_range_expand(ab[1],tol);
-  
-      {
-        const double av0=(tb[0].min+tb[0].max)/2, av1=(tb[1].min+tb[1].max)/2;
-        out->c0[0] = x0[0] + J[0]*av0 + J[1]*av1;
-        out->c0[1] = x0[1] + J[2]*av0 + J[3]*av1;
-      }
-      {
-        const double di0 = 2/((1+tol)*(tb[0].max-tb[0].min)),
-                     di1 = 2/((1+tol)*(tb[1].max-tb[1].min));
-        out->A[0]=di0*Ji[0], out->A[1]=di0*Ji[1];
-        out->A[2]=di1*Ji[2], out->A[3]=di1*Ji[3];
-      }
-
-    }
-  }
-  
-  free(data);  
-}
-
-void obbox_calc_3(struct obbox_3 *out,
-                  const double *const elx[3],
-                  const unsigned n[3], uint nel,
-                  const unsigned m[3], const double tol)
-{
-  const double *x = elx[0], *y = elx[1], *z = elx[2];
-  const unsigned nr = n[0], ns = n[1], nt = n[2];
-  const unsigned mr = m[0], ms = m[1], mt = m[2];
-
-  const unsigned nrs = nr*ns, nrst = nr*ns*nt;
-  double *data;
-  const unsigned lbsize0 = lob_bnd_size(nr,mr),
-                 lbsize1 = lob_bnd_size(ns,ms),
-                 lbsize2 = lob_bnd_size(nt,mt);
-  unsigned wsize = 3*nr*ns+2*mr*(ns+ms+1);
-  DO_MAX(wsize,6*nr*nt+2*mr*(nt+mt+1));
-  DO_MAX(wsize,6*ns*nt+2*ms*(nt+mt+1));
-  DO_MAX(wsize,2*nr*ns+3*nr);
-  DO_MAX(wsize,gll_lag_size(nr));
-  DO_MAX(wsize,gll_lag_size(ns));
-  DO_MAX(wsize,gll_lag_size(nt));
-  data = tmalloc(double, 2*(nr+ns+nt)+lbsize0+lbsize1+lbsize2+wsize);
-
-  {
-    double *const I0r = data, *const I0s = I0r+2*nr, *const I0t = I0s+2*ns;
-    double *const lob_bnd_data_r = data+2*(nr+ns+nt),
-           *const lob_bnd_data_s = data+2*(nr+ns+nt)+lbsize0,
-           *const lob_bnd_data_t = data+2*(nr+ns+nt)+lbsize0+lbsize1;
-    double *const work = data+2*(nr+ns+nt)+lbsize0+lbsize1+lbsize2;
-    
-    #define SETUP_DIR(r) do { \
-      lagrange_fun *const lag = gll_lag_setup(work, n##r); \
-      lag(I0##r, work,n##r,1, 0); \
-      lob_bnd_setup(lob_bnd_data_##r, n##r,m##r); \
-    } while(0)
-    
-    SETUP_DIR(r); SETUP_DIR(s); SETUP_DIR(t);
-    
-    #undef SETUP_DIR
-    
-    for(;nel;--nel,x+=nrst,y+=nrst,z+=nrst,++out) {
-      double x0[3], J[9], Ji[9];
-      struct dbl_range ab[3], tb[3];
-  
-      /* double work[2*nrs+3*nr] */
-      #define EVAL_AT_0(d,x) \
-        x0[d] = tensor_ig3(J+3*d, I0r,nr, I0s,ns, I0t,nt, x, work)
-      EVAL_AT_0(0,x); EVAL_AT_0(1,y); EVAL_AT_0(2,z);                          
-      mat_inv_3(Ji, J);
-      #undef EVAL_AT_0
- 
-      /* double work[2*m##r*(n##s+m##s+1)] */
-      #define DO_BOUND(bnd,merge,r,s,x,work) do { \
-        struct dbl_range b = \
-        lob_bnd_2(lob_bnd_data_##r,n##r,m##r, \
-                  lob_bnd_data_##s,n##s,m##s, x, work); \
-        if(merge) bnd=dbl_range_merge(bnd,b); else bnd=b; \
-      } while(0)
-
-      /* double work[3*n##r*n##s+2*m##r*(n##s+m##s+1)] */
-      #define DO_FACE(merge,r,s,x,y,z,work) do { \
-        DO_BOUND(ab[0],merge,r,s,x,work); \
-        DO_BOUND(ab[1],merge,r,s,y,work); \
-        DO_BOUND(ab[2],merge,r,s,z,work); \
-        bbox_3_tfm(work, x0,Ji, x,y,z,n##r*n##s); \
-        DO_BOUND(tb[0],merge,r,s,(work)            ,(work)+3*n##r*n##s); \
-        DO_BOUND(tb[1],merge,r,s,(work)+  n##r*n##s,(work)+3*n##r*n##s); \
-        DO_BOUND(tb[2],merge,r,s,(work)+2*n##r*n##s,(work)+3*n##r*n##s); \
-      } while(0)
-
-      DO_FACE(0,r,s,x,y,z,work);
-      DO_FACE(1,r,s,&x[nrst-nrs],&y[nrst-nrs],&z[nrst-nrs],work);
-
-      /* double work[6*n##r*n##s+2*m##r*(n##s+m##s+1)] */
-      #define GET_FACE(r,s,off,n1,n2,n3) do { \
-        copy_strided(work            , x+off,n1,n2,n3); \
-        copy_strided(work+  n##r*n##s, y+off,n1,n2,n3); \
-        copy_strided(work+2*n##r*n##s, z+off,n1,n2,n3); \
-        DO_FACE(1,r,s,work,work+n##r*n##s,work+2*n##r*n##s,work+3*n##r*n##s); \
-      } while(0)
-  
-      GET_FACE(r,t,0     ,nr,ns,nt);
-      GET_FACE(r,t,nrs-nr,nr,ns,nt);
-      GET_FACE(s,t,0     , 1,nr,ns*nt);
-      GET_FACE(s,t,nr-1  , 1,nr,ns*nt);
-      
-      #undef GET_FACE
-      #undef DO_FACE
-      #undef DO_BOUND
-
-      out->x[0] = dbl_range_expand(ab[0],tol),
-      out->x[1] = dbl_range_expand(ab[1],tol);
-      out->x[2] = dbl_range_expand(ab[2],tol);
-  
-      {
-        const double av0 = (tb[0].min+tb[0].max)/2,
-                     av1 = (tb[1].min+tb[1].max)/2,
-                     av2 = (tb[2].min+tb[2].max)/2;
-        out->c0[0] = x0[0] + J[0]*av0 + J[1]*av1 + J[2]*av2;
-        out->c0[1] = x0[1] + J[3]*av0 + J[4]*av1 + J[5]*av2;
-        out->c0[2] = x0[2] + J[6]*av0 + J[7]*av1 + J[8]*av2;
-      }
-      {
-        const double di0 = 2/((1+tol)*(tb[0].max-tb[0].min)),
-                     di1 = 2/((1+tol)*(tb[1].max-tb[1].min)),
-                     di2 = 2/((1+tol)*(tb[2].max-tb[2].min));
-        out->A[0]=di0*Ji[0], out->A[1]=di0*Ji[1], out->A[2]=di0*Ji[2];
-        out->A[3]=di1*Ji[3], out->A[4]=di1*Ji[4], out->A[5]=di1*Ji[5];
-        out->A[6]=di2*Ji[6], out->A[7]=di2*Ji[7], out->A[8]=di2*Ji[8];
-      }
-
-    }
-  }
-  
-  free(data);
-}
-
diff --git a/3rdParty/gslib/src/obbox.h b/3rdParty/gslib/src/obbox.h
deleted file mode 100644
index 8e5764fe2..000000000
--- a/3rdParty/gslib/src/obbox.h
+++ /dev/null
@@ -1,113 +0,0 @@
-#ifndef OBBOX_H
-#define OBBOX_H
-
-#if !defined(TYPES_H) || !defined(NAME_H)
-#warning "obbox.h" requires "types.h" and "name.h"
-#endif
-
-#define obbox_calc_2  PREFIXED_NAME(obbox_calc_2)
-#define obbox_calc_3  PREFIXED_NAME(obbox_calc_3)
-
-/*--------------------------------------------------------------------------
-   Oriented and axis-aligned bounding box computation for spectral elements
-   
-   Usage:
-   
-     double x[n][nt][ns][nr], y[n][nt][ns][nr], z[n][nt][ns][nr];
-     obbox_3 ob[n];
-
-     unsigned mr=4*nr, ms=4*ns, mt=4*nt;
-     double tol = 1e-6;
-     obbox_3_calc(ob, x,y,z, nr,ns,nt,n, mr,ms,mt, tol);
-     
-   The parameters mr,ms,mt specify number of points to use in computing
-   bounds (see lob_bnd.h). It is expected that mr>nr, etc. For reasonable
-   quality, a factor of at least 2 is recommended.
-  
-   tol is a relative amount by which to expand the bounding box.
-   This would accommodate, e.g., rounding errors.
-  
-   The axis aligned bounds for a given element are
-     ob[i].x.min <= x <= ob[i].x.max
-     ob[i].y.min <= y <= ob[i].y.max
-     ob[i].z.min <= z <= ob[i].z.max
-
-   The oriented bounding box is given by
-     (-1,-1,-1)^T <= ob[i].A * (x - ob[i].c0) <= (1,1,1)
-   
-   where the matrix is row-major format,
-     dx = x - c0[0], dy = y - c0[1], dz = z - c0[2]
-     -1 <= r[0] = A[0]*dx + A[1]*dy + A[2]*dz <= 1
-     -1 <= r[1] = A[3]*dx + A[4]*dy + A[5]*dz <= 1
-     -1 <= r[2] = A[6]*dx + A[7]*dy + A[8]*dz <= 1
-
-   Also, ob[i].A * (x - ob[i].c0) should be a reasonable seed for Newton's.
-    
-  --------------------------------------------------------------------------*/
-
-#ifndef LOB_BND_H
-struct dbl_range { double min, max; };
-#endif
-
-struct obbox_2 { double c0[2], A[4];
-                 struct dbl_range x[2]; };
-
-struct obbox_3 { double c0[3], A[9];
-                 struct dbl_range x[3]; };
-
-void obbox_calc_2(struct obbox_2 *out,
-                  const double *const elx[2],
-                  const unsigned n[2], uint nel,
-                  const unsigned m[2], const double tol);
-
-void obbox_calc_3(struct obbox_3 *out,
-                  const double *const elx[3],
-                  const unsigned n[3], uint nel,
-                  const unsigned m[3], const double tol);
-
-/* positive when possibly inside */
-static double obbox_axis_test_2(const struct obbox_2 *const b,
-                                const double x[2])
-{
-  const double bx =  (x[0]-b->x[0].min)*(b->x[0].max-x[0]);
-  return bx<0 ? bx : (x[1]-b->x[1].min)*(b->x[1].max-x[1]);
-}
-
-/* positive when possibly inside */
-static double obbox_test_2(const struct obbox_2 *const b, const double x[2])
-{
-  const double bxy = obbox_axis_test_2(b,x);
-  if(bxy<0) return bxy; else {
-    const double dx = x[0]-b->c0[0], dy = x[1]-b->c0[1];
-    const double r = b->A[0]*dx + b->A[1]*dy,
-                 s = b->A[2]*dx + b->A[3]*dy;
-    const double br = (r+1)*(1-r);
-    return br<0 ? br : (s+1)*(1-s);
-  }
-}
-
-/* positive when possibly inside */
-static double obbox_axis_test_3(const struct obbox_3 *const b,
-                                const double x[3])
-{
-  const double               bx = (x[0]-b->x[0].min)*(b->x[0].max-x[0]);
-  const double               by = (x[1]-b->x[1].min)*(b->x[1].max-x[1]);
-  return bx<0 ? bx : (by<0 ? by : (x[2]-b->x[2].min)*(b->x[2].max-x[2]));
-}
-
-/* positive when possibly inside */
-static double obbox_test_3(const struct obbox_3 *const b, const double x[3])
-{
-  const double bxyz = obbox_axis_test_3(b,x);
-  if(bxyz<0) return bxyz; else {
-    const double dx = x[0]-b->c0[0], dy = x[1]-b->c0[1], dz = x[2]-b->c0[2];
-    const double r = b->A[0]*dx + b->A[1]*dy + b->A[2]*dz,
-                 s = b->A[3]*dx + b->A[4]*dy + b->A[5]*dz,
-                 t = b->A[6]*dx + b->A[7]*dy + b->A[8]*dz;
-    const double br = (r+1)*(1-r), bs = (s+1)*(1-s);
-    return br<0 ? br : (bs<0 ? bs : (t+1)*(1-t));
-  }
-}
-
-#endif
-
diff --git a/3rdParty/gslib/src/poly.c b/3rdParty/gslib/src/poly.c
deleted file mode 100644
index 00ad22b11..000000000
--- a/3rdParty/gslib/src/poly.c
+++ /dev/null
@@ -1,236 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <math.h>    /* for cos, fabs */
-#include <float.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "mem.h"
-
-#define lagrange_size  PREFIXED_NAME(lagrange_size )
-#define lagrange_setup PREFIXED_NAME(lagrange_setup)
-#define gauss_nodes    PREFIXED_NAME(gauss_nodes   )
-#define gauss_quad     PREFIXED_NAME(gauss_quad    )
-#define lobatto_nodes  PREFIXED_NAME(lobatto_nodes )
-#define lobatto_quad   PREFIXED_NAME(lobatto_quad  )
-#define gll_lag_size   PREFIXED_NAME(gll_lag_size  )
-#define gll_lag_setup  PREFIXED_NAME(gll_lag_setup )
-
-typedef void lagrange_fun(double *restrict p,
-  double *restrict data, unsigned n, int d, double x);
-
-#include "poly_imp.h"
-
-static void lagrange_eval(double *restrict p,
-                          double *restrict data, unsigned n, int der, double x)
-{{
-  unsigned i;
-  const double *restrict z=data, *restrict w=z+n;
-  double *restrict d=data+2*n, *restrict u0=d+n, *restrict v0=u0+n;
-  for(i=0;i<n;++i) d[i]=2*(x-z[i]);
-  u0[0  ]=1; for(i=0  ;i<n-1;++i) u0[i+1]=u0[i]*d[i];
-  v0[n-1]=1; for(i=n-1;i    ;--i) v0[i-1]=d[i]*v0[i];
-  for(i=0;i<n;++i) p[i]=w[i]*u0[i]*v0[i];
-  if(der>0) {
-    double *restrict p1 = p+n, *restrict u1=v0+n, *restrict v1=u1+n;
-    u1[0  ]=0; for(i=0  ;i<n-1;++i) u1[i+1]=u1[i]*d[i]+u0[i];
-    v1[n-1]=0; for(i=n-1;i    ;--i) v1[i-1]=d[i]*v1[i]+v0[i];
-    for(i=0;i<n;++i) p1[i]=2*w[i]*(u1[i]*v0[i]+u0[i]*v1[i]);
-    if(der>1) {
-      double *restrict p2 = p1+n, *restrict u2=v1+n, *restrict v2=u2+n;
-      u2[0  ]=0; for(i=0  ;i<n-1;++i) u2[i+1]=u2[i]*d[i]+2*u1[i];
-      v2[n-1]=0; for(i=n-1;i    ;--i) v2[i-1]=d[i]*v2[i]+2*v1[i];
-      for(i=0;i<n;++i)
-        p2[i]=4*w[i]*(u2[i]*v0[i]+2*u1[i]*v1[i]+u0[i]*v2[i]);
-    }
-  }
-}}
-
-static void lagrange_coef(
-  double *restrict p, double *data, double *w, const double *z,
-  unsigned n, lagrange_fun *lag_eval)
-{
-  unsigned i;
-  for(i=0;i<n;++i) w[i]=1;
-  for(i=0;i<n;++i) lag_eval(p,data,n,0,z[i]), w[i]=1/p[i];
-}
-
-unsigned lagrange_size(unsigned n)
-{
-  return 9*n;
-}
-
-lagrange_fun *lagrange_setup(
-  double *restrict data, const double *restrict z, unsigned n)
-{
-  double *restrict p = tmalloc(double,n);
-  memcpy(data,z,n*sizeof(double));
-  lagrange_coef(p,data,data+n,z,n,&lagrange_eval);
-  free(p);
-  return &lagrange_eval;
-}
-
-#define EPS   (128*DBL_EPSILON)
-#define PI 3.1415926535897932384626433832795028841971693993751058209749445923
-
-/* 
-  For brevity's sake, some names have been shortened
-  Quadrature rules
-    Gauss   -> Gauss-Legendre quadrature (open)
-    Lobatto -> Gauss-Lobatto-Legendre quadrature (closed at both ends)
-  Polynomial bases
-    Legendre -> Legendre basis
-    Gauss    -> Lagrangian basis using Gauss   quadrature nodes
-    Lobatto  -> Lagrangian basis using Lobatto quadrature nodes
-*/
-
-/*--------------------------------------------------------------------------
-   Legendre Polynomial Computation
-   compute P_n(x) or P_n'(x) or P_n''(x)
-  --------------------------------------------------------------------------*/
-
-/* precondition: n >= 0 */
-static double legendre(int n, double x)
-{
-  double p[2];
-  double i, nn=n-0.5; /* avoid int -> double conversions */
-  p[0]=1.,p[1]=x;
-  for(i=1; i<nn; i+=2) {
-    p[0] = ((2*i+1)*x*p[1]- i   *p[0])/(i+1);
-    p[1] = ((2*i+3)*x*p[0]-(i+1)*p[1])/(i+2);
-  }
-  return p[n&1];
-}
-
-/* precondition: n > 0 */
-static double legendre_d1(int n, double x)
-{
-  double p[2];
-  double i, nn=n-0.5; /* avoid int -> double conversions */
-  p[0]=3*x,p[1]=1;
-  for(i=2; i<nn; i+=2) {
-    p[1] = ((2*i+1)*x*p[0]-(i+1)*p[1])/i;
-    p[0] = ((2*i+3)*x*p[1]-(i+2)*p[0])/(i+1);
-  }
-  return p[n&1];
-}
-
-/* precondition: n > 1 */
-static double legendre_d2(int n, double x)
-{
-  double p[2];
-  double i, nn=n-0.5; /* avoid int -> double conversions */
-  p[0]=3,p[1]=15*x;
-  for(i=3; i<nn; i+=2) {
-    p[0] = ((2*i+1)*x*p[1]-(i+2)*p[0])/(i-1);
-    p[1] = ((2*i+3)*x*p[0]-(i+3)*p[1])/i;
-  }
-  return p[n&1];
-}
-
-/*--------------------------------------------------------------------------
-   Quadrature Nodes and Weights Calculation
-   compute the n Gauss-Legendre nodes and weights or
-           the n Gauss-Lobatto-Legendre nodes and weights
-  --------------------------------------------------------------------------*/
-
-/* n nodes */
-void gauss_nodes(double *restrict z, int n)
-{
-  int i,j;
-  for(i=0; i<=n/2-1; ++i) {
-    double ox, x = cos( (2*n-2*i-1)*(PI/2)/n );
-    do {
-      ox = x;
-      x -= legendre(n,x)/legendre_d1(n,x);
-    } while(fabs(x-ox)>-x*EPS);
-    z[i] = x - legendre(n,x)/legendre_d1(n,x);
-  }
-  if(n&1) z[n/2]=0;
-  for(j=(n+1)/2,i=n/2-1; j<n; ++j,--i) z[j]=-z[i];
-}
-
-/* n inner lobatto nodes (excluding -1,1) */
-static void lobatto_nodes_aux(double *restrict z, int n)
-{
-  int i,j,np=n+1;
-  for(i=0; i<=n/2-1; ++i) {
-    double ox, x = cos( (n-i)*PI/np );
-    do {
-      ox = x;
-      x -= legendre_d1(np,x)/legendre_d2(np,x);
-    } while(fabs(x-ox)>-x*EPS);
-    z[i] = x - legendre_d1(np,x)/legendre_d2(np,x);
-  }
-  if(n&1) z[n/2]=0;
-  for(j=(n+1)/2,i=n/2-1; j<n; ++j,--i) z[j]=-z[i];
-}
-
-/* n lobatto nodes */
-static void lobatto_nodes_n(double *restrict z, int n)
-{
-  z[0] = -1, z[n-1] = 1;
-  lobatto_nodes_aux(&z[1],n-2);
-}
-
-static void lobatto_nodes_fix(double *restrict z, int n)
-{
-  z[0] = -1, z[n-1] = 1;
-  if(n&1) z[n/2]=0;
-  if(n>=4) {
-    const double *restrict gllz = gllz_table[n-4];
-    int i,j;
-    for(i=1;i<=n/2-1;++i) z[i] = -gllz[i-1];
-    for(j=(n+1)/2,i=n/2-1; j<n-1; ++j,--i) z[j] = gllz[i-1];
-  }
-}
-
-void lobatto_nodes(double *restrict z, int n)
-{
-  if(n>GLL_LAG_FIX_MAX) lobatto_nodes_n(z,n);
-  else if(n>=2) lobatto_nodes_fix(z,n);
-}
-
-void gauss_quad(double *restrict z, double *restrict w, int n)
-{
-  int i,j;
-  gauss_nodes(z,n);
-  for(i=0; i<=(n-1)/2; ++i) {
-    double d = (n+1)*legendre(n+1,z[i]);
-    w[i] = 2*(1-z[i]*z[i])/(d*d);
-  }
-  for(j=(n+1)/2,i=n/2-1; j<n; ++j,--i) w[j]=w[i];
-}
-
-void lobatto_quad(double *restrict z, double *restrict w, int n)
-{
-  int i,j;
-  lobatto_nodes(z,n);
-  for(i=0; i<=(n-1)/2; ++i) {
-    double d = legendre(n-1,z[i]);
-    w[i] = 2/((n-1)*n*d*d);
-  }
-  for(j=(n+1)/2,i=n/2-1; j<n; ++j,--i) w[j]=w[i];
-}
-
-unsigned gll_lag_size(unsigned n)
-{
-  return (n<=GLL_LAG_FIX_MAX?1:9)*n;
-}
-
-lagrange_fun *gll_lag_setup(double *restrict data, int n)
-{
-  double *z, *w, *p;
-  lagrange_fun *f;
-  if(n<2) return 0;
-  p = tmalloc(double,2*n);
-  if(n<=GLL_LAG_FIX_MAX)
-    f=gll_lag_table[n-2], z=p+n, w=data;
-  else
-    f=&lagrange_eval, z=data, w=z+n;
-  lobatto_nodes(z,n);
-  lagrange_coef(p,data,w,z,n,f);
-  free(p);
-  return f;
-}
diff --git a/3rdParty/gslib/src/poly.h b/3rdParty/gslib/src/poly.h
deleted file mode 100644
index 2fa162a07..000000000
--- a/3rdParty/gslib/src/poly.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef POLY_H
-#define POLY_H
-
-#if !defined(NAME_H)
-#warning "poly.h" requires "name.h"
-#endif
-
-#define lagrange_size  PREFIXED_NAME(lagrange_size )
-#define lagrange_setup PREFIXED_NAME(lagrange_setup)
-#define gauss_nodes    PREFIXED_NAME(gauss_nodes   )
-#define gauss_quad     PREFIXED_NAME(gauss_quad    )
-#define lobatto_nodes  PREFIXED_NAME(lobatto_nodes )
-#define lobatto_quad   PREFIXED_NAME(lobatto_quad  )
-#define gll_lag_size   PREFIXED_NAME(gll_lag_size  )
-#define gll_lag_setup  PREFIXED_NAME(gll_lag_setup )
-
-/*--------------------------------------------------------------------------
-   Quadrature Nodes and Weights Calculation
-
-    Gauss   -> Gauss-Legendre quadrature (open)
-    Lobatto -> Gauss-Lobatto-Legendre quadrature (closed at both ends)
-   
-   the _quad functions compute both nodes and weights
-  --------------------------------------------------------------------------*/
-
-void   gauss_nodes(double *restrict z, int n); /* n nodes (order = 2n-1) */
-void lobatto_nodes(double *restrict z, int n); /* n nodes (order = 2n-3) */
-
-void   gauss_quad(double *restrict z, double *restrict w, int n);
-void lobatto_quad(double *restrict z, double *restrict w, int n);
-
-/*--------------------------------------------------------------------------
-   Lagrangian basis function evaluation
-   
-   Usage:
-   
-   double z[N] = ..., x = ...; // nodes and evaluation point
-   double p[3*N];
-   double *data = tmalloc(double, lagrange_size(N));
-   lagrange_fun *const lag = lagrange_setup(data, z, N);
-   
-   int d = ...; // 0, 1, or 2  --- the highest derivative to compute
-   lag(p, data,N,d, x);
-   // now p[i] = h_i(x), 0 <= i < N 
-   // if d>=1, p[N+i] = h_i'(x)
-   // if d>=2, p[2*N+i] = h_i''(x)
-   free(data);
-   
-   gll_lag_* are similar, but are specialized  for GLL nodes, and faster,
-   and also don't need to be given the nodal locations
-  --------------------------------------------------------------------------*/
-
-typedef void lagrange_fun(double *restrict p,
-  double *restrict data, unsigned n, int d, double x);
-
-unsigned lagrange_size(unsigned n);
-lagrange_fun *lagrange_setup(
-  double *restrict data, const double *restrict z, unsigned n);
-
-unsigned gll_lag_size(unsigned n);
-lagrange_fun *gll_lag_setup(double *restrict data, int n);
-
-
-#endif
-
diff --git a/3rdParty/gslib/src/poly_imp.h b/3rdParty/gslib/src/poly_imp.h
deleted file mode 100644
index 6ca55a7a4..000000000
--- a/3rdParty/gslib/src/poly_imp.h
+++ /dev/null
@@ -1,1949 +0,0 @@
-/* generated by gen_poly_imp.c */
-
-#define GLL_LAG_FIX_MAX 24
-
-static void gll_lag_02(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x-2            ;
-  const double u0_01=    1*d00;
-  const double v0_00=d01*    1;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*    1;
-  if(d>0) {
-    p[2+ 0]=2*w[ 0]*(                      1);
-    p[2+ 1]=2*w[ 1]*(    1                  );
-    if(d>1) {
-      p[2*2+ 0]=0;
-      p[2*2+ 1]=0;
-    }
-  }
-}
-
-static void gll_lag_03(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x              ,d02=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01;
-  const double v0_01=d02*    1,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01;
-    const double v1_00=d01*    1+v0_01;
-    p[3+ 0]=2*w[ 0]*(                  v1_00);
-    p[3+ 1]=2*w[ 1]*(    1*v0_01+u0_01*    1);
-    p[3+ 2]=2*w[ 2]*(u1_02                  );
-    if(d>1) {
-      p[2*3+ 0]=4*w[ 0]*(                         +    1*    2);
-      p[2*3+ 1]=4*w[ 1]*(           +2*    1*    1            );
-      p[2*3+ 2]=4*w[ 2]*(    2*    1                          );
-    }
-  }
-}
-
-static const double gllz_04[ 1] = {
-  0.44721359549995793928183473374625524708812367192231
-};
-
-static void gll_lag_04(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_04[ 0],d02=x-2*gllz_04[ 0],
-               d03=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02;
-  const double v0_02=d03*    1,v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02;
-    const double v1_01=d02*    1+v0_02,v1_00=d01*v1_01+v0_01;
-    p[4+ 0]=2*w[ 0]*(                  v1_00);
-    p[4+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[4+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*    1);
-    p[4+ 3]=2*w[ 3]*(u1_03                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02;
-      const double v2_00=d01*    2+2*v1_01;
-      p[2*4+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*4+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*    2);
-      p[2*4+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*    1            );
-      p[2*4+ 3]=4*w[ 3]*(u2_03*    1                          );
-    }
-  }
-}
-
-static const double gllz_05[ 1] = {
-  0.65465367070797714379829245624685835556920808239542
-};
-
-static void gll_lag_05(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_05[ 0],d02=x              ,
-               d03=x-2*gllz_05[ 0],d04=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03;
-  const double v0_03=d04*    1,v0_02=d03*v0_03,v0_01=d02*v0_02,
-               v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03;
-    const double v1_02=d03*    1+v0_03,v1_01=d02*v1_02+v0_02,
-                 v1_00=d01*v1_01+v0_01;
-    p[5+ 0]=2*w[ 0]*(                  v1_00);
-    p[5+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[5+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[5+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*    1);
-    p[5+ 4]=2*w[ 4]*(u1_04                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03;
-      const double v2_01=d02*    2+2*v1_02,v2_00=d01*v2_01+2*v1_01;
-      p[2*5+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*5+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*5+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*    2);
-      p[2*5+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*    1            );
-      p[2*5+ 4]=4*w[ 4]*(u2_04*    1                          );
-    }
-  }
-}
-
-static const double gllz_06[ 2] = {
-  0.7650553239294646928510029739593381503657356885361,
-  0.28523151648064509631415099404087907191900347272643
-};
-
-static void gll_lag_06(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_06[ 0],d02=x+2*gllz_06[ 1],
-               d03=x-2*gllz_06[ 1],d04=x-2*gllz_06[ 0],d05=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04;
-  const double v0_04=d05*    1,v0_03=d04*v0_04,v0_02=d03*v0_03,
-               v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04;
-    const double v1_03=d04*    1+v0_04,v1_02=d03*v1_03+v0_03,
-                 v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01;
-    p[6+ 0]=2*w[ 0]*(                  v1_00);
-    p[6+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[6+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[6+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[6+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*    1);
-    p[6+ 5]=2*w[ 5]*(u1_05                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04;
-      const double v2_02=d03*    2+2*v1_03,v2_01=d02*v2_02+2*v1_02,
-                   v2_00=d01*v2_01+2*v1_01;
-      p[2*6+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*6+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*6+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*6+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*    2);
-      p[2*6+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*    1            );
-      p[2*6+ 5]=4*w[ 5]*(u2_05*    1                          );
-    }
-  }
-}
-
-static const double gllz_07[ 2] = {
-  0.830223896278566929872032213967465139587170364872,
-  0.46884879347071421380377188190876632940559747167184
-};
-
-static void gll_lag_07(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_07[ 0],d02=x+2*gllz_07[ 1],
-               d03=x              ,d04=x-2*gllz_07[ 1],d05=x-2*gllz_07[ 0],
-               d06=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05;
-  const double v0_05=d06*    1,v0_04=d05*v0_05,v0_03=d04*v0_04,
-               v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05;
-    const double v1_04=d05*    1+v0_05,v1_03=d04*v1_04+v0_04,
-                 v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02,
-                 v1_00=d01*v1_01+v0_01;
-    p[7+ 0]=2*w[ 0]*(                  v1_00);
-    p[7+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[7+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[7+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[7+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[7+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*    1);
-    p[7+ 6]=2*w[ 6]*(u1_06                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05;
-      const double v2_03=d04*    2+2*v1_04,v2_02=d03*v2_03+2*v1_03,
-                   v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01;
-      p[2*7+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*7+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*7+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*7+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*7+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*    2);
-      p[2*7+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*    1            );
-      p[2*7+ 6]=4*w[ 6]*(u2_06*    1                          );
-    }
-  }
-}
-
-static const double gllz_08[ 3] = {
-  0.87174014850960661533744576122066343810378066967698,
-  0.59170018143314230214451073139795318994570098951733,
-  0.20929921790247886876865726034535125529554540508668
-};
-
-static void gll_lag_08(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_08[ 0],d02=x+2*gllz_08[ 1],
-               d03=x+2*gllz_08[ 2],d04=x-2*gllz_08[ 2],d05=x-2*gllz_08[ 1],
-               d06=x-2*gllz_08[ 0],d07=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06;
-  const double v0_06=d07*    1,v0_05=d06*v0_06,v0_04=d05*v0_05,
-               v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02,
-               v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06;
-    const double v1_05=d06*    1+v0_06,v1_04=d05*v1_05+v0_05,
-                 v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03,
-                 v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01;
-    p[8+ 0]=2*w[ 0]*(                  v1_00);
-    p[8+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[8+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[8+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[8+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[8+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[8+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*    1);
-    p[8+ 7]=2*w[ 7]*(u1_07                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06;
-      const double v2_04=d05*    2+2*v1_05,v2_03=d04*v2_04+2*v1_04,
-                   v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02,
-                   v2_00=d01*v2_01+2*v1_01;
-      p[2*8+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*8+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*8+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*8+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*8+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*8+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*    2);
-      p[2*8+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*    1            );
-      p[2*8+ 7]=4*w[ 7]*(u2_07*    1                          );
-    }
-  }
-}
-
-static const double gllz_09[ 3] = {
-  0.8997579954114601573123452444183379580514802955661,
-  0.67718627951073775344588542709134245071102964761391,
-  0.36311746382617815871075206870865921302064227760088
-};
-
-static void gll_lag_09(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_09[ 0],d02=x+2*gllz_09[ 1],
-               d03=x+2*gllz_09[ 2],d04=x              ,d05=x-2*gllz_09[ 2],
-               d06=x-2*gllz_09[ 1],d07=x-2*gllz_09[ 0],d08=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07;
-  const double v0_07=d08*    1,v0_06=d07*v0_07,v0_05=d06*v0_06,
-               v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03,
-               v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07;
-    const double v1_06=d07*    1+v0_07,v1_05=d06*v1_06+v0_06,
-                 v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04,
-                 v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02,
-                 v1_00=d01*v1_01+v0_01;
-    p[9+ 0]=2*w[ 0]*(                  v1_00);
-    p[9+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[9+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[9+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[9+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[9+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[9+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[9+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*    1);
-    p[9+ 8]=2*w[ 8]*(u1_08                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07;
-      const double v2_05=d06*    2+2*v1_06,v2_04=d05*v2_05+2*v1_05,
-                   v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03,
-                   v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01;
-      p[2*9+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*9+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*9+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*9+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*9+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*9+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*9+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*    2);
-      p[2*9+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*    1            );
-      p[2*9+ 8]=4*w[ 8]*(u2_08*    1                          );
-    }
-  }
-}
-
-static const double gllz_10[ 4] = {
-  0.91953390816645881382893266082233813415354307544628,
-  0.73877386510550507500310617485983072501618510137693,
-  0.47792494981044449566117509273125799788677289333057,
-  0.16527895766638702462621976595817353323115034354948
-};
-
-static void gll_lag_10(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_10[ 0],d02=x+2*gllz_10[ 1],
-               d03=x+2*gllz_10[ 2],d04=x+2*gllz_10[ 3],d05=x-2*gllz_10[ 3],
-               d06=x-2*gllz_10[ 2],d07=x-2*gllz_10[ 1],d08=x-2*gllz_10[ 0],
-               d09=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08;
-  const double v0_08=d09*    1,v0_07=d08*v0_08,v0_06=d07*v0_07,
-               v0_05=d06*v0_06,v0_04=d05*v0_05,v0_03=d04*v0_04,
-               v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08;
-    const double v1_07=d08*    1+v0_08,v1_06=d07*v1_07+v0_07,
-                 v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05,
-                 v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03,
-                 v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01;
-    p[10+ 0]=2*w[ 0]*(                  v1_00);
-    p[10+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[10+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[10+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[10+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[10+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[10+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[10+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[10+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*    1);
-    p[10+ 9]=2*w[ 9]*(u1_09                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08;
-      const double v2_06=d07*    2+2*v1_07,v2_05=d06*v2_06+2*v1_06,
-                   v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04,
-                   v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02,
-                   v2_00=d01*v2_01+2*v1_01;
-      p[2*10+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*10+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*10+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*10+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*10+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*10+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*10+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*10+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*    2);
-      p[2*10+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*    1            );
-      p[2*10+ 9]=4*w[ 9]*(u2_09*    1                          );
-    }
-  }
-}
-
-static const double gllz_11[ 4] = {
-  0.93400143040805913433227413609938363453991733010996,
-  0.78448347366314441862241781610845810350719745509406,
-  0.56523532699620500647096396947775166428305214556202,
-  0.2957581355869393914319115155590575089410064343486
-};
-
-static void gll_lag_11(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_11[ 0],d02=x+2*gllz_11[ 1],
-               d03=x+2*gllz_11[ 2],d04=x+2*gllz_11[ 3],d05=x              ,
-               d06=x-2*gllz_11[ 3],d07=x-2*gllz_11[ 2],d08=x-2*gllz_11[ 1],
-               d09=x-2*gllz_11[ 0],d10=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09;
-  const double v0_09=d10*    1,v0_08=d09*v0_09,v0_07=d08*v0_08,
-               v0_06=d07*v0_07,v0_05=d06*v0_06,v0_04=d05*v0_05,
-               v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02,
-               v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09;
-    const double v1_08=d09*    1+v0_09,v1_07=d08*v1_08+v0_08,
-                 v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06,
-                 v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04,
-                 v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02,
-                 v1_00=d01*v1_01+v0_01;
-    p[11+ 0]=2*w[ 0]*(                  v1_00);
-    p[11+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[11+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[11+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[11+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[11+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[11+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[11+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[11+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[11+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*    1);
-    p[11+10]=2*w[10]*(u1_10                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09;
-      const double v2_07=d08*    2+2*v1_08,v2_06=d07*v2_07+2*v1_07,
-                   v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05,
-                   v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03,
-                   v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01;
-      p[2*11+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*11+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*11+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*11+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*11+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*11+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*11+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*11+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*11+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*    2);
-      p[2*11+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*    1            );
-      p[2*11+10]=4*w[10]*(u2_10*    1                          );
-    }
-  }
-}
-
-static const double gllz_12[ 5] = {
-  0.94489927222288222340758013830321871361125655195003,
-  0.81927932164400667834864158171690266069046665790364,
-  0.6328761530318606776624048544436558582438437454015,
-  0.39953094096534893226434979156696690052774803279531,
-  0.13655293285492755486406185573969389689841411128206
-};
-
-static void gll_lag_12(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_12[ 0],d02=x+2*gllz_12[ 1],
-               d03=x+2*gllz_12[ 2],d04=x+2*gllz_12[ 3],d05=x+2*gllz_12[ 4],
-               d06=x-2*gllz_12[ 4],d07=x-2*gllz_12[ 3],d08=x-2*gllz_12[ 2],
-               d09=x-2*gllz_12[ 1],d10=x-2*gllz_12[ 0],d11=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10;
-  const double v0_10=d11*    1,v0_09=d10*v0_10,v0_08=d09*v0_09,
-               v0_07=d08*v0_08,v0_06=d07*v0_07,v0_05=d06*v0_06,
-               v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03,
-               v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10;
-    const double v1_09=d10*    1+v0_10,v1_08=d09*v1_09+v0_09,
-                 v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07,
-                 v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05,
-                 v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03,
-                 v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01;
-    p[12+ 0]=2*w[ 0]*(                  v1_00);
-    p[12+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[12+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[12+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[12+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[12+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[12+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[12+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[12+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[12+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[12+10]=2*w[10]*(u1_10*v0_10+u0_10*    1);
-    p[12+11]=2*w[11]*(u1_11                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10;
-      const double v2_08=d09*    2+2*v1_09,v2_07=d08*v2_08+2*v1_08,
-                   v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06,
-                   v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04,
-                   v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02,
-                   v2_00=d01*v2_01+2*v1_01;
-      p[2*12+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*12+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*12+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*12+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*12+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*12+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*12+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*12+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*12+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*12+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*    2);
-      p[2*12+10]=4*w[10]*(u2_10*v0_10+2*u1_10*    1            );
-      p[2*12+11]=4*w[11]*(u2_11*    1                          );
-    }
-  }
-}
-
-static const double gllz_13[ 5] = {
-  0.95330984664216391189690546475544915162650788869736,
-  0.84634756465187231686592560709875335957803665971441,
-  0.68618846908175742607275903956635555292917619812438,
-  0.48290982109133620174693723363693362077219326211859,
-  0.24928693010623999256867370037422698148881131249298
-};
-
-static void gll_lag_13(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_13[ 0],d02=x+2*gllz_13[ 1],
-               d03=x+2*gllz_13[ 2],d04=x+2*gllz_13[ 3],d05=x+2*gllz_13[ 4],
-               d06=x              ,d07=x-2*gllz_13[ 4],d08=x-2*gllz_13[ 3],
-               d09=x-2*gllz_13[ 2],d10=x-2*gllz_13[ 1],d11=x-2*gllz_13[ 0],
-               d12=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11;
-  const double v0_11=d12*    1,v0_10=d11*v0_11,v0_09=d10*v0_10,
-               v0_08=d09*v0_09,v0_07=d08*v0_08,v0_06=d07*v0_07,
-               v0_05=d06*v0_06,v0_04=d05*v0_05,v0_03=d04*v0_04,
-               v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11;
-    const double v1_10=d11*    1+v0_11,v1_09=d10*v1_10+v0_10,
-                 v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08,
-                 v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06,
-                 v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04,
-                 v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02,
-                 v1_00=d01*v1_01+v0_01;
-    p[13+ 0]=2*w[ 0]*(                  v1_00);
-    p[13+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[13+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[13+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[13+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[13+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[13+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[13+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[13+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[13+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[13+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[13+11]=2*w[11]*(u1_11*v0_11+u0_11*    1);
-    p[13+12]=2*w[12]*(u1_12                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11;
-      const double v2_09=d10*    2+2*v1_10,v2_08=d09*v2_09+2*v1_09,
-                   v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07,
-                   v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05,
-                   v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03,
-                   v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01;
-      p[2*13+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*13+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*13+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*13+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*13+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*13+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*13+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*13+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*13+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*13+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*13+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*    2);
-      p[2*13+11]=4*w[11]*(u2_11*v0_11+2*u1_11*    1            );
-      p[2*13+12]=4*w[12]*(u2_12*    1                          );
-    }
-  }
-}
-
-static const double gllz_14[ 6] = {
-  0.95993504526726090135510016201542438906639151857265,
-  0.86780105383034725100022020290826421324987235309444,
-  0.72886859909132614058467240052088159565733953169432,
-  0.55063940292864705531662270585908063446213831955391,
-  0.34272401334271284504390340364167464483311353414031,
-  0.11633186888370386765877670973616016794150904425628
-};
-
-static void gll_lag_14(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_14[ 0],d02=x+2*gllz_14[ 1],
-               d03=x+2*gllz_14[ 2],d04=x+2*gllz_14[ 3],d05=x+2*gllz_14[ 4],
-               d06=x+2*gllz_14[ 5],d07=x-2*gllz_14[ 5],d08=x-2*gllz_14[ 4],
-               d09=x-2*gllz_14[ 3],d10=x-2*gllz_14[ 2],d11=x-2*gllz_14[ 1],
-               d12=x-2*gllz_14[ 0],d13=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11,
-               u0_13=u0_12*d12;
-  const double v0_12=d13*    1,v0_11=d12*v0_12,v0_10=d11*v0_11,
-               v0_09=d10*v0_10,v0_08=d09*v0_09,v0_07=d08*v0_08,
-               v0_06=d07*v0_07,v0_05=d06*v0_06,v0_04=d05*v0_05,
-               v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02,
-               v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12;
-    const double v1_11=d12*    1+v0_12,v1_10=d11*v1_11+v0_11,
-                 v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09,
-                 v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07,
-                 v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05,
-                 v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03,
-                 v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01;
-    p[14+ 0]=2*w[ 0]*(                  v1_00);
-    p[14+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[14+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[14+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[14+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[14+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[14+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[14+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[14+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[14+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[14+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[14+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11);
-    p[14+12]=2*w[12]*(u1_12*v0_12+u0_12*    1);
-    p[14+13]=2*w[13]*(u1_13                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11,
-                   u2_13=u2_12*d12+2*u1_12;
-      const double v2_10=d11*    2+2*v1_11,v2_09=d10*v2_10+2*v1_10,
-                   v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08,
-                   v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06,
-                   v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04,
-                   v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02,
-                   v2_00=d01*v2_01+2*v1_01;
-      p[2*14+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*14+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*14+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*14+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*14+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*14+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*14+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*14+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*14+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*14+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*14+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10);
-      p[2*14+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*    2);
-      p[2*14+12]=4*w[12]*(u2_12*v0_12+2*u1_12*    1            );
-      p[2*14+13]=4*w[13]*(u2_13*    1                          );
-    }
-  }
-}
-
-static const double gllz_15[ 6] = {
-  0.96524592650383857279585139206960117770765013599709,
-  0.88508204422297629882540163148222965198871408520748,
-  0.76351968995181520070411847597629161817736852031529,
-  0.60625320546984571112352993863673350717973103375992,
-  0.42063805471367248092189693873858041298433820549243,
-  0.21535395536379423822567944627291771265215790120304
-};
-
-static void gll_lag_15(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_15[ 0],d02=x+2*gllz_15[ 1],
-               d03=x+2*gllz_15[ 2],d04=x+2*gllz_15[ 3],d05=x+2*gllz_15[ 4],
-               d06=x+2*gllz_15[ 5],d07=x              ,d08=x-2*gllz_15[ 5],
-               d09=x-2*gllz_15[ 4],d10=x-2*gllz_15[ 3],d11=x-2*gllz_15[ 2],
-               d12=x-2*gllz_15[ 1],d13=x-2*gllz_15[ 0],d14=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11,
-               u0_13=u0_12*d12,u0_14=u0_13*d13;
-  const double v0_13=d14*    1,v0_12=d13*v0_13,v0_11=d12*v0_12,
-               v0_10=d11*v0_11,v0_09=d10*v0_10,v0_08=d09*v0_09,
-               v0_07=d08*v0_08,v0_06=d07*v0_07,v0_05=d06*v0_06,
-               v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03,
-               v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12,
-                 u1_14=u1_13*d13+u0_13;
-    const double v1_12=d13*    1+v0_13,v1_11=d12*v1_12+v0_12,
-                 v1_10=d11*v1_11+v0_11,v1_09=d10*v1_10+v0_10,
-                 v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08,
-                 v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06,
-                 v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04,
-                 v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02,
-                 v1_00=d01*v1_01+v0_01;
-    p[15+ 0]=2*w[ 0]*(                  v1_00);
-    p[15+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[15+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[15+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[15+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[15+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[15+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[15+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[15+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[15+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[15+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[15+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11);
-    p[15+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12);
-    p[15+13]=2*w[13]*(u1_13*v0_13+u0_13*    1);
-    p[15+14]=2*w[14]*(u1_14                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11,
-                   u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13;
-      const double v2_11=d12*    2+2*v1_12,v2_10=d11*v2_11+2*v1_11,
-                   v2_09=d10*v2_10+2*v1_10,v2_08=d09*v2_09+2*v1_09,
-                   v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07,
-                   v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05,
-                   v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03,
-                   v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01;
-      p[2*15+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*15+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*15+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*15+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*15+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*15+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*15+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*15+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*15+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*15+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*15+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10);
-      p[2*15+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11);
-      p[2*15+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*    2);
-      p[2*15+13]=4*w[13]*(u2_13*v0_13+2*u1_13*    1            );
-      p[2*15+14]=4*w[14]*(u2_14*    1                          );
-    }
-  }
-}
-
-static const double gllz_16[ 7] = {
-  0.96956804627021793295224273836745924138899074650383,
-  0.89920053309347209299462826151984947674999760904514,
-  0.7920082918618150639310882709631457058080738279802,
-  0.65238870288249308946788321964058148032155801282957,
-  0.48605942188713761178189078584687469688897730429825,
-  0.29983046890076320809835345472230064781546097690778,
-  0.10132627352194944784303300504591776253324091440019
-};
-
-static void gll_lag_16(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_16[ 0],d02=x+2*gllz_16[ 1],
-               d03=x+2*gllz_16[ 2],d04=x+2*gllz_16[ 3],d05=x+2*gllz_16[ 4],
-               d06=x+2*gllz_16[ 5],d07=x+2*gllz_16[ 6],d08=x-2*gllz_16[ 6],
-               d09=x-2*gllz_16[ 5],d10=x-2*gllz_16[ 4],d11=x-2*gllz_16[ 3],
-               d12=x-2*gllz_16[ 2],d13=x-2*gllz_16[ 1],d14=x-2*gllz_16[ 0],
-               d15=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11,
-               u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14;
-  const double v0_14=d15*    1,v0_13=d14*v0_14,v0_12=d13*v0_13,
-               v0_11=d12*v0_12,v0_10=d11*v0_11,v0_09=d10*v0_10,
-               v0_08=d09*v0_09,v0_07=d08*v0_08,v0_06=d07*v0_07,
-               v0_05=d06*v0_06,v0_04=d05*v0_05,v0_03=d04*v0_04,
-               v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14;
-  p[15]=w[15]*u0_15*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12,
-                 u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14;
-    const double v1_13=d14*    1+v0_14,v1_12=d13*v1_13+v0_13,
-                 v1_11=d12*v1_12+v0_12,v1_10=d11*v1_11+v0_11,
-                 v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09,
-                 v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07,
-                 v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05,
-                 v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03,
-                 v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01;
-    p[16+ 0]=2*w[ 0]*(                  v1_00);
-    p[16+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[16+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[16+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[16+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[16+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[16+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[16+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[16+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[16+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[16+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[16+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11);
-    p[16+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12);
-    p[16+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13);
-    p[16+14]=2*w[14]*(u1_14*v0_14+u0_14*    1);
-    p[16+15]=2*w[15]*(u1_15                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11,
-                   u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13,
-                   u2_15=u2_14*d14+2*u1_14;
-      const double v2_12=d13*    2+2*v1_13,v2_11=d12*v2_12+2*v1_12,
-                   v2_10=d11*v2_11+2*v1_11,v2_09=d10*v2_10+2*v1_10,
-                   v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08,
-                   v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06,
-                   v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04,
-                   v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02,
-                   v2_00=d01*v2_01+2*v1_01;
-      p[2*16+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*16+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*16+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*16+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*16+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*16+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*16+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*16+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*16+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*16+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*16+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10);
-      p[2*16+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11);
-      p[2*16+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12);
-      p[2*16+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*    2);
-      p[2*16+14]=4*w[14]*(u2_14*v0_14+2*u1_14*    1            );
-      p[2*16+15]=4*w[15]*(u2_15*    1                          );
-    }
-  }
-}
-
-static const double gllz_17[ 7] = {
-  0.97313217663141831415697950187372143058895914912251,
-  0.91087999591557359562380250639772646753087945186873,
-  0.81569625122177030710675055323752665471640239706712,
-  0.69102898062768470539491935737245329680641306219042,
-  0.54138539933010153912373340750406325167514664796483,
-  0.37217443356547704190723468073525781255981731440028,
-  0.1895119735183173883042630147531139713449924229225
-};
-
-static void gll_lag_17(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_17[ 0],d02=x+2*gllz_17[ 1],
-               d03=x+2*gllz_17[ 2],d04=x+2*gllz_17[ 3],d05=x+2*gllz_17[ 4],
-               d06=x+2*gllz_17[ 5],d07=x+2*gllz_17[ 6],d08=x              ,
-               d09=x-2*gllz_17[ 6],d10=x-2*gllz_17[ 5],d11=x-2*gllz_17[ 4],
-               d12=x-2*gllz_17[ 3],d13=x-2*gllz_17[ 2],d14=x-2*gllz_17[ 1],
-               d15=x-2*gllz_17[ 0],d16=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11,
-               u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14,
-               u0_16=u0_15*d15;
-  const double v0_15=d16*    1,v0_14=d15*v0_15,v0_13=d14*v0_14,
-               v0_12=d13*v0_13,v0_11=d12*v0_12,v0_10=d11*v0_11,
-               v0_09=d10*v0_10,v0_08=d09*v0_09,v0_07=d08*v0_08,
-               v0_06=d07*v0_07,v0_05=d06*v0_06,v0_04=d05*v0_05,
-               v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02,
-               v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14;
-  p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12,
-                 u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14,
-                 u1_16=u1_15*d15+u0_15;
-    const double v1_14=d15*    1+v0_15,v1_13=d14*v1_14+v0_14,
-                 v1_12=d13*v1_13+v0_13,v1_11=d12*v1_12+v0_12,
-                 v1_10=d11*v1_11+v0_11,v1_09=d10*v1_10+v0_10,
-                 v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08,
-                 v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06,
-                 v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04,
-                 v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02,
-                 v1_00=d01*v1_01+v0_01;
-    p[17+ 0]=2*w[ 0]*(                  v1_00);
-    p[17+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[17+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[17+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[17+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[17+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[17+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[17+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[17+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[17+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[17+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[17+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11);
-    p[17+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12);
-    p[17+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13);
-    p[17+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14);
-    p[17+15]=2*w[15]*(u1_15*v0_15+u0_15*    1);
-    p[17+16]=2*w[16]*(u1_16                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11,
-                   u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13,
-                   u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15;
-      const double v2_13=d14*    2+2*v1_14,v2_12=d13*v2_13+2*v1_13,
-                   v2_11=d12*v2_12+2*v1_12,v2_10=d11*v2_11+2*v1_11,
-                   v2_09=d10*v2_10+2*v1_10,v2_08=d09*v2_09+2*v1_09,
-                   v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07,
-                   v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05,
-                   v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03,
-                   v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01;
-      p[2*17+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*17+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*17+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*17+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*17+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*17+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*17+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*17+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*17+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*17+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*17+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10);
-      p[2*17+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11);
-      p[2*17+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12);
-      p[2*17+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13);
-      p[2*17+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*    2);
-      p[2*17+15]=4*w[15]*(u2_15*v0_15+2*u1_15*    1            );
-      p[2*17+16]=4*w[16]*(u2_16*    1                          );
-    }
-  }
-}
-
-static const double gllz_18[ 8] = {
-  0.97610555741219854286451892434170006676181344271919,
-  0.92064918534753387383785462543127742356235348618904,
-  0.83559353521809021371364636232793725743367075916582,
-  0.72367932928324268130621036530207067914952520415476,
-  0.58850483431866176117353589319355946900083678931622,
-  0.43441503691212397534228713674067479584975844516369,
-  0.26636265287828098416766533202559594206513618931826,
-  0.089749093484652111022645010088561734960603901041125
-};
-
-static void gll_lag_18(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_18[ 0],d02=x+2*gllz_18[ 1],
-               d03=x+2*gllz_18[ 2],d04=x+2*gllz_18[ 3],d05=x+2*gllz_18[ 4],
-               d06=x+2*gllz_18[ 5],d07=x+2*gllz_18[ 6],d08=x+2*gllz_18[ 7],
-               d09=x-2*gllz_18[ 7],d10=x-2*gllz_18[ 6],d11=x-2*gllz_18[ 5],
-               d12=x-2*gllz_18[ 4],d13=x-2*gllz_18[ 3],d14=x-2*gllz_18[ 2],
-               d15=x-2*gllz_18[ 1],d16=x-2*gllz_18[ 0],d17=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11,
-               u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14,
-               u0_16=u0_15*d15,u0_17=u0_16*d16;
-  const double v0_16=d17*    1,v0_15=d16*v0_16,v0_14=d15*v0_15,
-               v0_13=d14*v0_14,v0_12=d13*v0_13,v0_11=d12*v0_12,
-               v0_10=d11*v0_11,v0_09=d10*v0_10,v0_08=d09*v0_09,
-               v0_07=d08*v0_08,v0_06=d07*v0_07,v0_05=d06*v0_06,
-               v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03,
-               v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14;
-  p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12,
-                 u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14,
-                 u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16;
-    const double v1_15=d16*    1+v0_16,v1_14=d15*v1_15+v0_15,
-                 v1_13=d14*v1_14+v0_14,v1_12=d13*v1_13+v0_13,
-                 v1_11=d12*v1_12+v0_12,v1_10=d11*v1_11+v0_11,
-                 v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09,
-                 v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07,
-                 v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05,
-                 v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03,
-                 v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01;
-    p[18+ 0]=2*w[ 0]*(                  v1_00);
-    p[18+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[18+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[18+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[18+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[18+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[18+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[18+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[18+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[18+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[18+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[18+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11);
-    p[18+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12);
-    p[18+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13);
-    p[18+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14);
-    p[18+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15);
-    p[18+16]=2*w[16]*(u1_16*v0_16+u0_16*    1);
-    p[18+17]=2*w[17]*(u1_17                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11,
-                   u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13,
-                   u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15,
-                   u2_17=u2_16*d16+2*u1_16;
-      const double v2_14=d15*    2+2*v1_15,v2_13=d14*v2_14+2*v1_14,
-                   v2_12=d13*v2_13+2*v1_13,v2_11=d12*v2_12+2*v1_12,
-                   v2_10=d11*v2_11+2*v1_11,v2_09=d10*v2_10+2*v1_10,
-                   v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08,
-                   v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06,
-                   v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04,
-                   v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02,
-                   v2_00=d01*v2_01+2*v1_01;
-      p[2*18+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*18+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*18+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*18+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*18+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*18+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*18+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*18+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*18+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*18+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*18+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10);
-      p[2*18+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11);
-      p[2*18+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12);
-      p[2*18+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13);
-      p[2*18+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14);
-      p[2*18+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*    2);
-      p[2*18+16]=4*w[16]*(u2_16*v0_16+2*u1_16*    1            );
-      p[2*18+17]=4*w[17]*(u2_17*    1                          );
-    }
-  }
-}
-
-static const double gllz_19[ 8] = {
-  0.97861176622208009515263406311022256281427733781081,
-  0.92890152815258624371794025879654861245016818225195,
-  0.85246057779664609308595597004106262523709538083887,
-  0.7514942025526130141636374896339440404036593556658,
-  0.62890813726522049776683230622873254706861115718956,
-  0.48822928568071350277790963762492336977121559965148,
-  0.33350484782449861029850010384492701192296337547773,
-  0.16918602340928157137515415344488042375289555076585
-};
-
-static void gll_lag_19(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_19[ 0],d02=x+2*gllz_19[ 1],
-               d03=x+2*gllz_19[ 2],d04=x+2*gllz_19[ 3],d05=x+2*gllz_19[ 4],
-               d06=x+2*gllz_19[ 5],d07=x+2*gllz_19[ 6],d08=x+2*gllz_19[ 7],
-               d09=x              ,d10=x-2*gllz_19[ 7],d11=x-2*gllz_19[ 6],
-               d12=x-2*gllz_19[ 5],d13=x-2*gllz_19[ 4],d14=x-2*gllz_19[ 3],
-               d15=x-2*gllz_19[ 2],d16=x-2*gllz_19[ 1],d17=x-2*gllz_19[ 0],
-               d18=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11,
-               u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14,
-               u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17;
-  const double v0_17=d18*    1,v0_16=d17*v0_17,v0_15=d16*v0_16,
-               v0_14=d15*v0_15,v0_13=d14*v0_14,v0_12=d13*v0_13,
-               v0_11=d12*v0_12,v0_10=d11*v0_11,v0_09=d10*v0_10,
-               v0_08=d09*v0_09,v0_07=d08*v0_08,v0_06=d07*v0_07,
-               v0_05=d06*v0_06,v0_04=d05*v0_05,v0_03=d04*v0_04,
-               v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14;
-  p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17;
-  p[18]=w[18]*u0_18*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12,
-                 u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14,
-                 u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16,
-                 u1_18=u1_17*d17+u0_17;
-    const double v1_16=d17*    1+v0_17,v1_15=d16*v1_16+v0_16,
-                 v1_14=d15*v1_15+v0_15,v1_13=d14*v1_14+v0_14,
-                 v1_12=d13*v1_13+v0_13,v1_11=d12*v1_12+v0_12,
-                 v1_10=d11*v1_11+v0_11,v1_09=d10*v1_10+v0_10,
-                 v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08,
-                 v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06,
-                 v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04,
-                 v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02,
-                 v1_00=d01*v1_01+v0_01;
-    p[19+ 0]=2*w[ 0]*(                  v1_00);
-    p[19+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[19+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[19+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[19+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[19+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[19+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[19+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[19+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[19+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[19+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[19+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11);
-    p[19+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12);
-    p[19+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13);
-    p[19+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14);
-    p[19+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15);
-    p[19+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16);
-    p[19+17]=2*w[17]*(u1_17*v0_17+u0_17*    1);
-    p[19+18]=2*w[18]*(u1_18                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11,
-                   u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13,
-                   u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15,
-                   u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17;
-      const double v2_15=d16*    2+2*v1_16,v2_14=d15*v2_15+2*v1_15,
-                   v2_13=d14*v2_14+2*v1_14,v2_12=d13*v2_13+2*v1_13,
-                   v2_11=d12*v2_12+2*v1_12,v2_10=d11*v2_11+2*v1_11,
-                   v2_09=d10*v2_10+2*v1_10,v2_08=d09*v2_09+2*v1_09,
-                   v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07,
-                   v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05,
-                   v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03,
-                   v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01;
-      p[2*19+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*19+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*19+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*19+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*19+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*19+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*19+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*19+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*19+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*19+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*19+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10);
-      p[2*19+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11);
-      p[2*19+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12);
-      p[2*19+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13);
-      p[2*19+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14);
-      p[2*19+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15);
-      p[2*19+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16*    2);
-      p[2*19+17]=4*w[17]*(u2_17*v0_17+2*u1_17*    1            );
-      p[2*19+18]=4*w[18]*(u2_18*    1                          );
-    }
-  }
-}
-
-static const double gllz_20[ 9] = {
-  0.98074370489391417192544643858423091522991062312625,
-  0.93593449881266543571618158493062692991557383318105,
-  0.86687797808995014130984721461628521396291128831699,
-  0.77536826095205587041431752759469134337272185947653,
-  0.66377640229031128984640332297115885247574574199149,
-  0.53499286403188626164813596182898398300685156913752,
-  0.39235318371390929938647470381582436666520332929891,
-  0.23955170592298649518240135692708807194151780992738,
-  0.080545937238821837975944518159554463022392870092908
-};
-
-static void gll_lag_20(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_20[ 0],d02=x+2*gllz_20[ 1],
-               d03=x+2*gllz_20[ 2],d04=x+2*gllz_20[ 3],d05=x+2*gllz_20[ 4],
-               d06=x+2*gllz_20[ 5],d07=x+2*gllz_20[ 6],d08=x+2*gllz_20[ 7],
-               d09=x+2*gllz_20[ 8],d10=x-2*gllz_20[ 8],d11=x-2*gllz_20[ 7],
-               d12=x-2*gllz_20[ 6],d13=x-2*gllz_20[ 5],d14=x-2*gllz_20[ 4],
-               d15=x-2*gllz_20[ 3],d16=x-2*gllz_20[ 2],d17=x-2*gllz_20[ 1],
-               d18=x-2*gllz_20[ 0],d19=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11,
-               u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14,
-               u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17,
-               u0_19=u0_18*d18;
-  const double v0_18=d19*    1,v0_17=d18*v0_18,v0_16=d17*v0_17,
-               v0_15=d16*v0_16,v0_14=d15*v0_15,v0_13=d14*v0_14,
-               v0_12=d13*v0_13,v0_11=d12*v0_12,v0_10=d11*v0_11,
-               v0_09=d10*v0_10,v0_08=d09*v0_09,v0_07=d08*v0_08,
-               v0_06=d07*v0_07,v0_05=d06*v0_06,v0_04=d05*v0_05,
-               v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02,
-               v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14;
-  p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17;
-  p[18]=w[18]*u0_18*v0_18; p[19]=w[19]*u0_19*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12,
-                 u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14,
-                 u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16,
-                 u1_18=u1_17*d17+u0_17,u1_19=u1_18*d18+u0_18;
-    const double v1_17=d18*    1+v0_18,v1_16=d17*v1_17+v0_17,
-                 v1_15=d16*v1_16+v0_16,v1_14=d15*v1_15+v0_15,
-                 v1_13=d14*v1_14+v0_14,v1_12=d13*v1_13+v0_13,
-                 v1_11=d12*v1_12+v0_12,v1_10=d11*v1_11+v0_11,
-                 v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09,
-                 v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07,
-                 v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05,
-                 v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03,
-                 v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01;
-    p[20+ 0]=2*w[ 0]*(                  v1_00);
-    p[20+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[20+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[20+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[20+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[20+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[20+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[20+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[20+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[20+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[20+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[20+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11);
-    p[20+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12);
-    p[20+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13);
-    p[20+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14);
-    p[20+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15);
-    p[20+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16);
-    p[20+17]=2*w[17]*(u1_17*v0_17+u0_17*v1_17);
-    p[20+18]=2*w[18]*(u1_18*v0_18+u0_18*    1);
-    p[20+19]=2*w[19]*(u1_19                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11,
-                   u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13,
-                   u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15,
-                   u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17,
-                   u2_19=u2_18*d18+2*u1_18;
-      const double v2_16=d17*    2+2*v1_17,v2_15=d16*v2_16+2*v1_16,
-                   v2_14=d15*v2_15+2*v1_15,v2_13=d14*v2_14+2*v1_14,
-                   v2_12=d13*v2_13+2*v1_13,v2_11=d12*v2_12+2*v1_12,
-                   v2_10=d11*v2_11+2*v1_11,v2_09=d10*v2_10+2*v1_10,
-                   v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08,
-                   v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06,
-                   v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04,
-                   v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02,
-                   v2_00=d01*v2_01+2*v1_01;
-      p[2*20+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*20+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*20+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*20+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*20+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*20+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*20+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*20+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*20+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*20+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*20+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10);
-      p[2*20+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11);
-      p[2*20+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12);
-      p[2*20+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13);
-      p[2*20+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14);
-      p[2*20+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15);
-      p[2*20+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16*v2_16);
-      p[2*20+17]=4*w[17]*(u2_17*v0_17+2*u1_17*v1_17+u0_17*    2);
-      p[2*20+18]=4*w[18]*(u2_18*v0_18+2*u1_18*    1            );
-      p[2*20+19]=4*w[19]*(u2_19*    1                          );
-    }
-  }
-}
-
-static const double gllz_21[ 9] = {
-  0.98257229660454802823448127655540587685917158823641,
-  0.94197629695974553429610265066143517664965087404401,
-  0.8792947553235904644511535963049440477105815515092,
-  0.79600192607771240474431258966035863909041966054978,
-  0.69405102606222323262731639319466662875771600610585,
-  0.57583196026183068692702187033808528733577300855848,
-  0.44411578327900210119451634960735128473505748656706,
-  0.30198985650876488727535186785875223202107103406039,
-  0.15278551580218546600635832848566943551774899331328
-};
-
-static void gll_lag_21(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_21[ 0],d02=x+2*gllz_21[ 1],
-               d03=x+2*gllz_21[ 2],d04=x+2*gllz_21[ 3],d05=x+2*gllz_21[ 4],
-               d06=x+2*gllz_21[ 5],d07=x+2*gllz_21[ 6],d08=x+2*gllz_21[ 7],
-               d09=x+2*gllz_21[ 8],d10=x              ,d11=x-2*gllz_21[ 8],
-               d12=x-2*gllz_21[ 7],d13=x-2*gllz_21[ 6],d14=x-2*gllz_21[ 5],
-               d15=x-2*gllz_21[ 4],d16=x-2*gllz_21[ 3],d17=x-2*gllz_21[ 2],
-               d18=x-2*gllz_21[ 1],d19=x-2*gllz_21[ 0],d20=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11,
-               u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14,
-               u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17,
-               u0_19=u0_18*d18,u0_20=u0_19*d19;
-  const double v0_19=d20*    1,v0_18=d19*v0_19,v0_17=d18*v0_18,
-               v0_16=d17*v0_17,v0_15=d16*v0_16,v0_14=d15*v0_15,
-               v0_13=d14*v0_14,v0_12=d13*v0_13,v0_11=d12*v0_12,
-               v0_10=d11*v0_11,v0_09=d10*v0_10,v0_08=d09*v0_09,
-               v0_07=d08*v0_08,v0_06=d07*v0_07,v0_05=d06*v0_06,
-               v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03,
-               v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14;
-  p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17;
-  p[18]=w[18]*u0_18*v0_18; p[19]=w[19]*u0_19*v0_19; p[20]=w[20]*u0_20*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12,
-                 u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14,
-                 u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16,
-                 u1_18=u1_17*d17+u0_17,u1_19=u1_18*d18+u0_18,
-                 u1_20=u1_19*d19+u0_19;
-    const double v1_18=d19*    1+v0_19,v1_17=d18*v1_18+v0_18,
-                 v1_16=d17*v1_17+v0_17,v1_15=d16*v1_16+v0_16,
-                 v1_14=d15*v1_15+v0_15,v1_13=d14*v1_14+v0_14,
-                 v1_12=d13*v1_13+v0_13,v1_11=d12*v1_12+v0_12,
-                 v1_10=d11*v1_11+v0_11,v1_09=d10*v1_10+v0_10,
-                 v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08,
-                 v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06,
-                 v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04,
-                 v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02,
-                 v1_00=d01*v1_01+v0_01;
-    p[21+ 0]=2*w[ 0]*(                  v1_00);
-    p[21+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[21+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[21+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[21+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[21+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[21+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[21+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[21+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[21+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[21+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[21+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11);
-    p[21+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12);
-    p[21+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13);
-    p[21+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14);
-    p[21+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15);
-    p[21+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16);
-    p[21+17]=2*w[17]*(u1_17*v0_17+u0_17*v1_17);
-    p[21+18]=2*w[18]*(u1_18*v0_18+u0_18*v1_18);
-    p[21+19]=2*w[19]*(u1_19*v0_19+u0_19*    1);
-    p[21+20]=2*w[20]*(u1_20                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11,
-                   u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13,
-                   u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15,
-                   u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17,
-                   u2_19=u2_18*d18+2*u1_18,u2_20=u2_19*d19+2*u1_19;
-      const double v2_17=d18*    2+2*v1_18,v2_16=d17*v2_17+2*v1_17,
-                   v2_15=d16*v2_16+2*v1_16,v2_14=d15*v2_15+2*v1_15,
-                   v2_13=d14*v2_14+2*v1_14,v2_12=d13*v2_13+2*v1_13,
-                   v2_11=d12*v2_12+2*v1_12,v2_10=d11*v2_11+2*v1_11,
-                   v2_09=d10*v2_10+2*v1_10,v2_08=d09*v2_09+2*v1_09,
-                   v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07,
-                   v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05,
-                   v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03,
-                   v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01;
-      p[2*21+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*21+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*21+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*21+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*21+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*21+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*21+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*21+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*21+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*21+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*21+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10);
-      p[2*21+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11);
-      p[2*21+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12);
-      p[2*21+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13);
-      p[2*21+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14);
-      p[2*21+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15);
-      p[2*21+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16*v2_16);
-      p[2*21+17]=4*w[17]*(u2_17*v0_17+2*u1_17*v1_17+u0_17*v2_17);
-      p[2*21+18]=4*w[18]*(u2_18*v0_18+2*u1_18*v1_18+u0_18*    2);
-      p[2*21+19]=4*w[19]*(u2_19*v0_19+2*u1_19*    1            );
-      p[2*21+20]=4*w[20]*(u2_20*    1                          );
-    }
-  }
-}
-
-static const double gllz_22[10] = {
-  0.98415243845764617655228962221207029660551353611952,
-  0.94720428399922868052421376661572950991206204534136,
-  0.89006229019090447052965782577908679019953408284715,
-  0.8139489276119211360454418480561350424386685149071,
-  0.7204872399612021581198818963984657585933454261195,
-  0.6116694382842589712262116058699265993454403046077,
-  0.48981487518990234980875123568327004167127163579515,
-  0.35752071013891953806095728024017912928330710394294,
-  0.21760658515928504178795509346539276327500669401419,
-  0.073054540010898334761088790464107356192779236333516
-};
-
-static void gll_lag_22(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_22[ 0],d02=x+2*gllz_22[ 1],
-               d03=x+2*gllz_22[ 2],d04=x+2*gllz_22[ 3],d05=x+2*gllz_22[ 4],
-               d06=x+2*gllz_22[ 5],d07=x+2*gllz_22[ 6],d08=x+2*gllz_22[ 7],
-               d09=x+2*gllz_22[ 8],d10=x+2*gllz_22[ 9],d11=x-2*gllz_22[ 9],
-               d12=x-2*gllz_22[ 8],d13=x-2*gllz_22[ 7],d14=x-2*gllz_22[ 6],
-               d15=x-2*gllz_22[ 5],d16=x-2*gllz_22[ 4],d17=x-2*gllz_22[ 3],
-               d18=x-2*gllz_22[ 2],d19=x-2*gllz_22[ 1],d20=x-2*gllz_22[ 0],
-               d21=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11,
-               u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14,
-               u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17,
-               u0_19=u0_18*d18,u0_20=u0_19*d19,u0_21=u0_20*d20;
-  const double v0_20=d21*    1,v0_19=d20*v0_20,v0_18=d19*v0_19,
-               v0_17=d18*v0_18,v0_16=d17*v0_17,v0_15=d16*v0_16,
-               v0_14=d15*v0_15,v0_13=d14*v0_14,v0_12=d13*v0_13,
-               v0_11=d12*v0_12,v0_10=d11*v0_11,v0_09=d10*v0_10,
-               v0_08=d09*v0_09,v0_07=d08*v0_08,v0_06=d07*v0_07,
-               v0_05=d06*v0_06,v0_04=d05*v0_05,v0_03=d04*v0_04,
-               v0_02=d03*v0_03,v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14;
-  p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17;
-  p[18]=w[18]*u0_18*v0_18; p[19]=w[19]*u0_19*v0_19; p[20]=w[20]*u0_20*v0_20;
-  p[21]=w[21]*u0_21*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12,
-                 u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14,
-                 u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16,
-                 u1_18=u1_17*d17+u0_17,u1_19=u1_18*d18+u0_18,
-                 u1_20=u1_19*d19+u0_19,u1_21=u1_20*d20+u0_20;
-    const double v1_19=d20*    1+v0_20,v1_18=d19*v1_19+v0_19,
-                 v1_17=d18*v1_18+v0_18,v1_16=d17*v1_17+v0_17,
-                 v1_15=d16*v1_16+v0_16,v1_14=d15*v1_15+v0_15,
-                 v1_13=d14*v1_14+v0_14,v1_12=d13*v1_13+v0_13,
-                 v1_11=d12*v1_12+v0_12,v1_10=d11*v1_11+v0_11,
-                 v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09,
-                 v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07,
-                 v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05,
-                 v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03,
-                 v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01;
-    p[22+ 0]=2*w[ 0]*(                  v1_00);
-    p[22+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[22+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[22+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[22+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[22+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[22+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[22+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[22+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[22+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[22+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[22+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11);
-    p[22+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12);
-    p[22+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13);
-    p[22+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14);
-    p[22+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15);
-    p[22+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16);
-    p[22+17]=2*w[17]*(u1_17*v0_17+u0_17*v1_17);
-    p[22+18]=2*w[18]*(u1_18*v0_18+u0_18*v1_18);
-    p[22+19]=2*w[19]*(u1_19*v0_19+u0_19*v1_19);
-    p[22+20]=2*w[20]*(u1_20*v0_20+u0_20*    1);
-    p[22+21]=2*w[21]*(u1_21                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11,
-                   u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13,
-                   u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15,
-                   u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17,
-                   u2_19=u2_18*d18+2*u1_18,u2_20=u2_19*d19+2*u1_19,
-                   u2_21=u2_20*d20+2*u1_20;
-      const double v2_18=d19*    2+2*v1_19,v2_17=d18*v2_18+2*v1_18,
-                   v2_16=d17*v2_17+2*v1_17,v2_15=d16*v2_16+2*v1_16,
-                   v2_14=d15*v2_15+2*v1_15,v2_13=d14*v2_14+2*v1_14,
-                   v2_12=d13*v2_13+2*v1_13,v2_11=d12*v2_12+2*v1_12,
-                   v2_10=d11*v2_11+2*v1_11,v2_09=d10*v2_10+2*v1_10,
-                   v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08,
-                   v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06,
-                   v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04,
-                   v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02,
-                   v2_00=d01*v2_01+2*v1_01;
-      p[2*22+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*22+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*22+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*22+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*22+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*22+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*22+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*22+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*22+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*22+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*22+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10);
-      p[2*22+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11);
-      p[2*22+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12);
-      p[2*22+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13);
-      p[2*22+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14);
-      p[2*22+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15);
-      p[2*22+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16*v2_16);
-      p[2*22+17]=4*w[17]*(u2_17*v0_17+2*u1_17*v1_17+u0_17*v2_17);
-      p[2*22+18]=4*w[18]*(u2_18*v0_18+2*u1_18*v1_18+u0_18*v2_18);
-      p[2*22+19]=4*w[19]*(u2_19*v0_19+2*u1_19*v1_19+u0_19*    2);
-      p[2*22+20]=4*w[20]*(u2_20*v0_20+2*u1_20*    1            );
-      p[2*22+21]=4*w[21]*(u2_21*    1                          );
-    }
-  }
-}
-
-static const double gllz_23[10] = {
-  0.98552715587873257808146276673809909902061079213965,
-  0.9517579557107102041356396798514291558483519254488,
-  0.89945855804034501095016032034736715791179834813929,
-  0.82965109665128588622320061929000488459851188301333,
-  0.74369504117206068394516354306699679128721922895386,
-  0.6432636444601362084761455336027687438913118818023,
-  0.53031177113684416813011532015229981113034651492734,
-  0.40703793791447482919595048821509563955195372399417,
-  0.27584154894579306710687763267913520417319110660942,
-  0.13927620404066839859186261298276693390854445717444
-};
-
-static void gll_lag_23(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_23[ 0],d02=x+2*gllz_23[ 1],
-               d03=x+2*gllz_23[ 2],d04=x+2*gllz_23[ 3],d05=x+2*gllz_23[ 4],
-               d06=x+2*gllz_23[ 5],d07=x+2*gllz_23[ 6],d08=x+2*gllz_23[ 7],
-               d09=x+2*gllz_23[ 8],d10=x+2*gllz_23[ 9],d11=x              ,
-               d12=x-2*gllz_23[ 9],d13=x-2*gllz_23[ 8],d14=x-2*gllz_23[ 7],
-               d15=x-2*gllz_23[ 6],d16=x-2*gllz_23[ 5],d17=x-2*gllz_23[ 4],
-               d18=x-2*gllz_23[ 3],d19=x-2*gllz_23[ 2],d20=x-2*gllz_23[ 1],
-               d21=x-2*gllz_23[ 0],d22=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11,
-               u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14,
-               u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17,
-               u0_19=u0_18*d18,u0_20=u0_19*d19,u0_21=u0_20*d20,
-               u0_22=u0_21*d21;
-  const double v0_21=d22*    1,v0_20=d21*v0_21,v0_19=d20*v0_20,
-               v0_18=d19*v0_19,v0_17=d18*v0_18,v0_16=d17*v0_17,
-               v0_15=d16*v0_16,v0_14=d15*v0_15,v0_13=d14*v0_14,
-               v0_12=d13*v0_13,v0_11=d12*v0_12,v0_10=d11*v0_11,
-               v0_09=d10*v0_10,v0_08=d09*v0_09,v0_07=d08*v0_08,
-               v0_06=d07*v0_07,v0_05=d06*v0_06,v0_04=d05*v0_05,
-               v0_03=d04*v0_04,v0_02=d03*v0_03,v0_01=d02*v0_02,
-               v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14;
-  p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17;
-  p[18]=w[18]*u0_18*v0_18; p[19]=w[19]*u0_19*v0_19; p[20]=w[20]*u0_20*v0_20;
-  p[21]=w[21]*u0_21*v0_21; p[22]=w[22]*u0_22*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12,
-                 u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14,
-                 u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16,
-                 u1_18=u1_17*d17+u0_17,u1_19=u1_18*d18+u0_18,
-                 u1_20=u1_19*d19+u0_19,u1_21=u1_20*d20+u0_20,
-                 u1_22=u1_21*d21+u0_21;
-    const double v1_20=d21*    1+v0_21,v1_19=d20*v1_20+v0_20,
-                 v1_18=d19*v1_19+v0_19,v1_17=d18*v1_18+v0_18,
-                 v1_16=d17*v1_17+v0_17,v1_15=d16*v1_16+v0_16,
-                 v1_14=d15*v1_15+v0_15,v1_13=d14*v1_14+v0_14,
-                 v1_12=d13*v1_13+v0_13,v1_11=d12*v1_12+v0_12,
-                 v1_10=d11*v1_11+v0_11,v1_09=d10*v1_10+v0_10,
-                 v1_08=d09*v1_09+v0_09,v1_07=d08*v1_08+v0_08,
-                 v1_06=d07*v1_07+v0_07,v1_05=d06*v1_06+v0_06,
-                 v1_04=d05*v1_05+v0_05,v1_03=d04*v1_04+v0_04,
-                 v1_02=d03*v1_03+v0_03,v1_01=d02*v1_02+v0_02,
-                 v1_00=d01*v1_01+v0_01;
-    p[23+ 0]=2*w[ 0]*(                  v1_00);
-    p[23+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[23+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[23+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[23+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[23+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[23+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[23+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[23+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[23+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[23+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[23+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11);
-    p[23+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12);
-    p[23+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13);
-    p[23+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14);
-    p[23+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15);
-    p[23+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16);
-    p[23+17]=2*w[17]*(u1_17*v0_17+u0_17*v1_17);
-    p[23+18]=2*w[18]*(u1_18*v0_18+u0_18*v1_18);
-    p[23+19]=2*w[19]*(u1_19*v0_19+u0_19*v1_19);
-    p[23+20]=2*w[20]*(u1_20*v0_20+u0_20*v1_20);
-    p[23+21]=2*w[21]*(u1_21*v0_21+u0_21*    1);
-    p[23+22]=2*w[22]*(u1_22                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11,
-                   u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13,
-                   u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15,
-                   u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17,
-                   u2_19=u2_18*d18+2*u1_18,u2_20=u2_19*d19+2*u1_19,
-                   u2_21=u2_20*d20+2*u1_20,u2_22=u2_21*d21+2*u1_21;
-      const double v2_19=d20*    2+2*v1_20,v2_18=d19*v2_19+2*v1_19,
-                   v2_17=d18*v2_18+2*v1_18,v2_16=d17*v2_17+2*v1_17,
-                   v2_15=d16*v2_16+2*v1_16,v2_14=d15*v2_15+2*v1_15,
-                   v2_13=d14*v2_14+2*v1_14,v2_12=d13*v2_13+2*v1_13,
-                   v2_11=d12*v2_12+2*v1_12,v2_10=d11*v2_11+2*v1_11,
-                   v2_09=d10*v2_10+2*v1_10,v2_08=d09*v2_09+2*v1_09,
-                   v2_07=d08*v2_08+2*v1_08,v2_06=d07*v2_07+2*v1_07,
-                   v2_05=d06*v2_06+2*v1_06,v2_04=d05*v2_05+2*v1_05,
-                   v2_03=d04*v2_04+2*v1_04,v2_02=d03*v2_03+2*v1_03,
-                   v2_01=d02*v2_02+2*v1_02,v2_00=d01*v2_01+2*v1_01;
-      p[2*23+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*23+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*23+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*23+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*23+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*23+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*23+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*23+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*23+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*23+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*23+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10);
-      p[2*23+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11);
-      p[2*23+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12);
-      p[2*23+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13);
-      p[2*23+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14);
-      p[2*23+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15);
-      p[2*23+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16*v2_16);
-      p[2*23+17]=4*w[17]*(u2_17*v0_17+2*u1_17*v1_17+u0_17*v2_17);
-      p[2*23+18]=4*w[18]*(u2_18*v0_18+2*u1_18*v1_18+u0_18*v2_18);
-      p[2*23+19]=4*w[19]*(u2_19*v0_19+2*u1_19*v1_19+u0_19*v2_19);
-      p[2*23+20]=4*w[20]*(u2_20*v0_20+2*u1_20*v1_20+u0_20*    2);
-      p[2*23+21]=4*w[21]*(u2_21*v0_21+2*u1_21*    1            );
-      p[2*23+22]=4*w[22]*(u2_22*    1                          );
-    }
-  }
-}
-
-static const double gllz_24[11] = {
-  0.9867305535051608835530867381544749753719197924133,
-  0.95574822092988635802697713055064483107073304295574,
-  0.90770567511350652199515299646620774920842011387828,
-  0.84346407015487204062330503742334228584107610081033,
-  0.7641704824204933077873752809522936513210604492369,
-  0.67124010526412869983566485818700675657402328894643,
-  0.56633135797929531218940954454228377043889499712648,
-  0.45131637321432261824821849156962244882308821831249,
-  0.3282476133755109120333891793596093437011778687727,
-  0.19932125339083266723657253912499073081187559142148,
-  0.066837993737228578113641808391677309796223208917628
-};
-
-static void gll_lag_24(double *restrict p, double *restrict w,
-                       unsigned n, int d, double xh)
-{
-  const double x = xh*2;
-  const double d00=x+2            ,d01=x+2*gllz_24[ 0],d02=x+2*gllz_24[ 1],
-               d03=x+2*gllz_24[ 2],d04=x+2*gllz_24[ 3],d05=x+2*gllz_24[ 4],
-               d06=x+2*gllz_24[ 5],d07=x+2*gllz_24[ 6],d08=x+2*gllz_24[ 7],
-               d09=x+2*gllz_24[ 8],d10=x+2*gllz_24[ 9],d11=x+2*gllz_24[10],
-               d12=x-2*gllz_24[10],d13=x-2*gllz_24[ 9],d14=x-2*gllz_24[ 8],
-               d15=x-2*gllz_24[ 7],d16=x-2*gllz_24[ 6],d17=x-2*gllz_24[ 5],
-               d18=x-2*gllz_24[ 4],d19=x-2*gllz_24[ 3],d20=x-2*gllz_24[ 2],
-               d21=x-2*gllz_24[ 1],d22=x-2*gllz_24[ 0],d23=x-2            ;
-  const double u0_01=    1*d00,u0_02=u0_01*d01,u0_03=u0_02*d02,
-               u0_04=u0_03*d03,u0_05=u0_04*d04,u0_06=u0_05*d05,
-               u0_07=u0_06*d06,u0_08=u0_07*d07,u0_09=u0_08*d08,
-               u0_10=u0_09*d09,u0_11=u0_10*d10,u0_12=u0_11*d11,
-               u0_13=u0_12*d12,u0_14=u0_13*d13,u0_15=u0_14*d14,
-               u0_16=u0_15*d15,u0_17=u0_16*d16,u0_18=u0_17*d17,
-               u0_19=u0_18*d18,u0_20=u0_19*d19,u0_21=u0_20*d20,
-               u0_22=u0_21*d21,u0_23=u0_22*d22;
-  const double v0_22=d23*    1,v0_21=d22*v0_22,v0_20=d21*v0_21,
-               v0_19=d20*v0_20,v0_18=d19*v0_19,v0_17=d18*v0_18,
-               v0_16=d17*v0_17,v0_15=d16*v0_16,v0_14=d15*v0_15,
-               v0_13=d14*v0_14,v0_12=d13*v0_13,v0_11=d12*v0_12,
-               v0_10=d11*v0_11,v0_09=d10*v0_10,v0_08=d09*v0_09,
-               v0_07=d08*v0_08,v0_06=d07*v0_07,v0_05=d06*v0_06,
-               v0_04=d05*v0_05,v0_03=d04*v0_04,v0_02=d03*v0_03,
-               v0_01=d02*v0_02,v0_00=d01*v0_01;
-  p[ 0]=w[ 0]*    1*v0_00; p[ 1]=w[ 1]*u0_01*v0_01; p[ 2]=w[ 2]*u0_02*v0_02;
-  p[ 3]=w[ 3]*u0_03*v0_03; p[ 4]=w[ 4]*u0_04*v0_04; p[ 5]=w[ 5]*u0_05*v0_05;
-  p[ 6]=w[ 6]*u0_06*v0_06; p[ 7]=w[ 7]*u0_07*v0_07; p[ 8]=w[ 8]*u0_08*v0_08;
-  p[ 9]=w[ 9]*u0_09*v0_09; p[10]=w[10]*u0_10*v0_10; p[11]=w[11]*u0_11*v0_11;
-  p[12]=w[12]*u0_12*v0_12; p[13]=w[13]*u0_13*v0_13; p[14]=w[14]*u0_14*v0_14;
-  p[15]=w[15]*u0_15*v0_15; p[16]=w[16]*u0_16*v0_16; p[17]=w[17]*u0_17*v0_17;
-  p[18]=w[18]*u0_18*v0_18; p[19]=w[19]*u0_19*v0_19; p[20]=w[20]*u0_20*v0_20;
-  p[21]=w[21]*u0_21*v0_21; p[22]=w[22]*u0_22*v0_22; p[23]=w[23]*u0_23*    1;
-  if(d>0) {
-    const double u1_02=    1*d01+u0_01,u1_03=u1_02*d02+u0_02,
-                 u1_04=u1_03*d03+u0_03,u1_05=u1_04*d04+u0_04,
-                 u1_06=u1_05*d05+u0_05,u1_07=u1_06*d06+u0_06,
-                 u1_08=u1_07*d07+u0_07,u1_09=u1_08*d08+u0_08,
-                 u1_10=u1_09*d09+u0_09,u1_11=u1_10*d10+u0_10,
-                 u1_12=u1_11*d11+u0_11,u1_13=u1_12*d12+u0_12,
-                 u1_14=u1_13*d13+u0_13,u1_15=u1_14*d14+u0_14,
-                 u1_16=u1_15*d15+u0_15,u1_17=u1_16*d16+u0_16,
-                 u1_18=u1_17*d17+u0_17,u1_19=u1_18*d18+u0_18,
-                 u1_20=u1_19*d19+u0_19,u1_21=u1_20*d20+u0_20,
-                 u1_22=u1_21*d21+u0_21,u1_23=u1_22*d22+u0_22;
-    const double v1_21=d22*    1+v0_22,v1_20=d21*v1_21+v0_21,
-                 v1_19=d20*v1_20+v0_20,v1_18=d19*v1_19+v0_19,
-                 v1_17=d18*v1_18+v0_18,v1_16=d17*v1_17+v0_17,
-                 v1_15=d16*v1_16+v0_16,v1_14=d15*v1_15+v0_15,
-                 v1_13=d14*v1_14+v0_14,v1_12=d13*v1_13+v0_13,
-                 v1_11=d12*v1_12+v0_12,v1_10=d11*v1_11+v0_11,
-                 v1_09=d10*v1_10+v0_10,v1_08=d09*v1_09+v0_09,
-                 v1_07=d08*v1_08+v0_08,v1_06=d07*v1_07+v0_07,
-                 v1_05=d06*v1_06+v0_06,v1_04=d05*v1_05+v0_05,
-                 v1_03=d04*v1_04+v0_04,v1_02=d03*v1_03+v0_03,
-                 v1_01=d02*v1_02+v0_02,v1_00=d01*v1_01+v0_01;
-    p[24+ 0]=2*w[ 0]*(                  v1_00);
-    p[24+ 1]=2*w[ 1]*(    1*v0_01+u0_01*v1_01);
-    p[24+ 2]=2*w[ 2]*(u1_02*v0_02+u0_02*v1_02);
-    p[24+ 3]=2*w[ 3]*(u1_03*v0_03+u0_03*v1_03);
-    p[24+ 4]=2*w[ 4]*(u1_04*v0_04+u0_04*v1_04);
-    p[24+ 5]=2*w[ 5]*(u1_05*v0_05+u0_05*v1_05);
-    p[24+ 6]=2*w[ 6]*(u1_06*v0_06+u0_06*v1_06);
-    p[24+ 7]=2*w[ 7]*(u1_07*v0_07+u0_07*v1_07);
-    p[24+ 8]=2*w[ 8]*(u1_08*v0_08+u0_08*v1_08);
-    p[24+ 9]=2*w[ 9]*(u1_09*v0_09+u0_09*v1_09);
-    p[24+10]=2*w[10]*(u1_10*v0_10+u0_10*v1_10);
-    p[24+11]=2*w[11]*(u1_11*v0_11+u0_11*v1_11);
-    p[24+12]=2*w[12]*(u1_12*v0_12+u0_12*v1_12);
-    p[24+13]=2*w[13]*(u1_13*v0_13+u0_13*v1_13);
-    p[24+14]=2*w[14]*(u1_14*v0_14+u0_14*v1_14);
-    p[24+15]=2*w[15]*(u1_15*v0_15+u0_15*v1_15);
-    p[24+16]=2*w[16]*(u1_16*v0_16+u0_16*v1_16);
-    p[24+17]=2*w[17]*(u1_17*v0_17+u0_17*v1_17);
-    p[24+18]=2*w[18]*(u1_18*v0_18+u0_18*v1_18);
-    p[24+19]=2*w[19]*(u1_19*v0_19+u0_19*v1_19);
-    p[24+20]=2*w[20]*(u1_20*v0_20+u0_20*v1_20);
-    p[24+21]=2*w[21]*(u1_21*v0_21+u0_21*v1_21);
-    p[24+22]=2*w[22]*(u1_22*v0_22+u0_22*    1);
-    p[24+23]=2*w[23]*(u1_23                  );
-    if(d>1) {
-      const double u2_03=    2*d02+2*u1_02,u2_04=u2_03*d03+2*u1_03,
-                   u2_05=u2_04*d04+2*u1_04,u2_06=u2_05*d05+2*u1_05,
-                   u2_07=u2_06*d06+2*u1_06,u2_08=u2_07*d07+2*u1_07,
-                   u2_09=u2_08*d08+2*u1_08,u2_10=u2_09*d09+2*u1_09,
-                   u2_11=u2_10*d10+2*u1_10,u2_12=u2_11*d11+2*u1_11,
-                   u2_13=u2_12*d12+2*u1_12,u2_14=u2_13*d13+2*u1_13,
-                   u2_15=u2_14*d14+2*u1_14,u2_16=u2_15*d15+2*u1_15,
-                   u2_17=u2_16*d16+2*u1_16,u2_18=u2_17*d17+2*u1_17,
-                   u2_19=u2_18*d18+2*u1_18,u2_20=u2_19*d19+2*u1_19,
-                   u2_21=u2_20*d20+2*u1_20,u2_22=u2_21*d21+2*u1_21,
-                   u2_23=u2_22*d22+2*u1_22;
-      const double v2_20=d21*    2+2*v1_21,v2_19=d20*v2_20+2*v1_20,
-                   v2_18=d19*v2_19+2*v1_19,v2_17=d18*v2_18+2*v1_18,
-                   v2_16=d17*v2_17+2*v1_17,v2_15=d16*v2_16+2*v1_16,
-                   v2_14=d15*v2_15+2*v1_15,v2_13=d14*v2_14+2*v1_14,
-                   v2_12=d13*v2_13+2*v1_13,v2_11=d12*v2_12+2*v1_12,
-                   v2_10=d11*v2_11+2*v1_11,v2_09=d10*v2_10+2*v1_10,
-                   v2_08=d09*v2_09+2*v1_09,v2_07=d08*v2_08+2*v1_08,
-                   v2_06=d07*v2_07+2*v1_07,v2_05=d06*v2_06+2*v1_06,
-                   v2_04=d05*v2_05+2*v1_05,v2_03=d04*v2_04+2*v1_04,
-                   v2_02=d03*v2_03+2*v1_03,v2_01=d02*v2_02+2*v1_02,
-                   v2_00=d01*v2_01+2*v1_01;
-      p[2*24+ 0]=4*w[ 0]*(                         +    1*v2_00);
-      p[2*24+ 1]=4*w[ 1]*(           +2*    1*v1_01+u0_01*v2_01);
-      p[2*24+ 2]=4*w[ 2]*(    2*v0_02+2*u1_02*v1_02+u0_02*v2_02);
-      p[2*24+ 3]=4*w[ 3]*(u2_03*v0_03+2*u1_03*v1_03+u0_03*v2_03);
-      p[2*24+ 4]=4*w[ 4]*(u2_04*v0_04+2*u1_04*v1_04+u0_04*v2_04);
-      p[2*24+ 5]=4*w[ 5]*(u2_05*v0_05+2*u1_05*v1_05+u0_05*v2_05);
-      p[2*24+ 6]=4*w[ 6]*(u2_06*v0_06+2*u1_06*v1_06+u0_06*v2_06);
-      p[2*24+ 7]=4*w[ 7]*(u2_07*v0_07+2*u1_07*v1_07+u0_07*v2_07);
-      p[2*24+ 8]=4*w[ 8]*(u2_08*v0_08+2*u1_08*v1_08+u0_08*v2_08);
-      p[2*24+ 9]=4*w[ 9]*(u2_09*v0_09+2*u1_09*v1_09+u0_09*v2_09);
-      p[2*24+10]=4*w[10]*(u2_10*v0_10+2*u1_10*v1_10+u0_10*v2_10);
-      p[2*24+11]=4*w[11]*(u2_11*v0_11+2*u1_11*v1_11+u0_11*v2_11);
-      p[2*24+12]=4*w[12]*(u2_12*v0_12+2*u1_12*v1_12+u0_12*v2_12);
-      p[2*24+13]=4*w[13]*(u2_13*v0_13+2*u1_13*v1_13+u0_13*v2_13);
-      p[2*24+14]=4*w[14]*(u2_14*v0_14+2*u1_14*v1_14+u0_14*v2_14);
-      p[2*24+15]=4*w[15]*(u2_15*v0_15+2*u1_15*v1_15+u0_15*v2_15);
-      p[2*24+16]=4*w[16]*(u2_16*v0_16+2*u1_16*v1_16+u0_16*v2_16);
-      p[2*24+17]=4*w[17]*(u2_17*v0_17+2*u1_17*v1_17+u0_17*v2_17);
-      p[2*24+18]=4*w[18]*(u2_18*v0_18+2*u1_18*v1_18+u0_18*v2_18);
-      p[2*24+19]=4*w[19]*(u2_19*v0_19+2*u1_19*v1_19+u0_19*v2_19);
-      p[2*24+20]=4*w[20]*(u2_20*v0_20+2*u1_20*v1_20+u0_20*v2_20);
-      p[2*24+21]=4*w[21]*(u2_21*v0_21+2*u1_21*v1_21+u0_21*    2);
-      p[2*24+22]=4*w[22]*(u2_22*v0_22+2*u1_22*    1            );
-      p[2*24+23]=4*w[23]*(u2_23*    1                          );
-    }
-  }
-}
-
-static const double *const gllz_table[21] = {
-  gllz_04, gllz_05, gllz_06, gllz_07, gllz_08, gllz_09, gllz_10, gllz_11,
-  gllz_12, gllz_13, gllz_14, gllz_15, gllz_16, gllz_17, gllz_18, gllz_19,
-  gllz_20, gllz_21, gllz_22, gllz_23, gllz_24
-};
-
-static lagrange_fun *const gll_lag_table[23] = {
-  &gll_lag_02, &gll_lag_03, &gll_lag_04, &gll_lag_05, &gll_lag_06, &gll_lag_07,
-  &gll_lag_08, &gll_lag_09, &gll_lag_10, &gll_lag_11, &gll_lag_12, &gll_lag_13,
-  &gll_lag_14, &gll_lag_15, &gll_lag_16, &gll_lag_17, &gll_lag_18, &gll_lag_19,
-  &gll_lag_20, &gll_lag_21, &gll_lag_22, &gll_lag_23, &gll_lag_24
-};
-
diff --git a/3rdParty/gslib/src/rand_elt_test.c b/3rdParty/gslib/src/rand_elt_test.c
deleted file mode 100644
index 1e11dae96..000000000
--- a/3rdParty/gslib/src/rand_elt_test.c
+++ /dev/null
@@ -1,169 +0,0 @@
-#include <stdlib.h>
-#include <math.h>
-#include "c99.h"
-#include "types.h"
-#include "name.h"
-#include "poly.h"
-#include "lob_bnd.h"
-
-static double det_2(const double A[4]) { return A[0]*A[3]-A[1]*A[2]; }
-
-static double quad_2(const double x0, const double g[2], const double H[3],
-                     const double r[2])
-{
-  return x0 + (g[0]*r[0]+g[1]*r[1])
-            + (  r[0] * (H[0]*r[0]+H[1]*r[1])
-               + r[1] * (H[1]*r[0]+H[2]*r[1]) )/2;
-}
-
-static void quad_2_grad(double grad[2], const double g[2], const double H[3],
-                        const double r[2])
-{
-  grad[0] = g[0] + (H[0]*r[0]+H[1]*r[1]);
-  grad[1] = g[1] + (H[1]*r[0]+H[2]*r[1]);
-}
-
-static double quad_2_jac(const double g[4], const double H[6],
-                         const double r[2])
-{
-  double J[4];
-  quad_2_grad(J  ,g  ,H  ,r);
-  quad_2_grad(J+2,g+2,H+3,r);
-  return det_2(J);
-}
-
-static double det_3(const double A[9])
-{
-  const double a = A[4]*A[8]-A[5]*A[7],
-               b = A[5]*A[6]-A[3]*A[8],
-               c = A[3]*A[7]-A[4]*A[6];
-  return A[0]*a+A[1]*b+A[2]*c;
-}
-
-static double quad_3(const double x0, const double g[3], const double H[6],
-                     const double r[3])
-{
-  return x0 + (g[0]*r[0]+g[1]*r[1]+g[2]*r[2])
-            + (  r[0] * (H[0]*r[0]+H[1]*r[1]+H[2]*r[2])
-               + r[1] * (H[1]*r[0]+H[3]*r[1]+H[4]*r[2])
-               + r[2] * (H[2]*r[0]+H[4]*r[1]+H[5]*r[2]) )/2;
-}
-
-static void quad_3_grad(double grad[3], const double g[3], const double H[6],
-                        const double r[3])
-{
-  grad[0] = g[0] + (H[0]*r[0]+H[1]*r[1]+H[2]*r[2]);
-  grad[1] = g[1] + (H[1]*r[0]+H[3]*r[1]+H[4]*r[2]);
-  grad[2] = g[2] + (H[2]*r[0]+H[4]*r[1]+H[5]*r[2]);
-}
-
-static double quad_3_jac(const double g[9], const double H[18],
-                         const double r[3])
-{
-  double J[9];
-  quad_3_grad(J  ,g  ,H   ,r);
-  quad_3_grad(J+3,g+3,H+ 6,r);
-  quad_3_grad(J+6,g+6,H+12,r);
-  return det_3(J);
-}
-
-void rand_elt_2(double *x, double *y,
-                const double *zr, unsigned nr,
-                const double *zs, unsigned ns)
-{
-  static int init=0;
-  static double z4[4], lob_bnd_data[16+3*4*(2*16+1)],
-                work[2*16*(4+16+1)];
-  unsigned i,j;
-  double x0[2], g[4], H[6], jac[4*4], r[2];
-  struct dbl_range jr;
-  if(!init) {
-    init=1;
-    lobatto_nodes(z4,4);
-    lob_bnd_setup(lob_bnd_data,4,16);
-  }
-  do {
-    for(i=0;i<4;++i) g[i] = -1+2*(rand()/(double)RAND_MAX);
-    for(i=0;i<6;++i) H[i] =.5*(-1+2*(rand()/(double)RAND_MAX));
-    for(j=0;j<4;++j) { r[1] = z4[j];
-      for(i=0;i<4;++i) { r[0] = z4[i];
-        jac[j*4+i] = quad_2_jac(g,H,r);
-      }
-    }
-    jr = lob_bnd_2(lob_bnd_data,4,16, lob_bnd_data,4,16, jac, work);
-    /*printf("Jacobian range %g, %g\n", jr.min, jr.max);*/
-  } while(jr.max*jr.min<=0);
-  for(i=0;i< 2;++i) x0[i] = -1+2*(rand()/(double)RAND_MAX);
-  for(j=0;j<ns;++j) {   r[1] = zs[j];
-    for(i=0;i<nr;++i) { r[0] = zr[i];
-      x[j*nr+i] = quad_2(x0[0],g  ,H  ,r);
-      y[j*nr+i] = quad_2(x0[1],g+2,H+3,r);
-    }
-  }
-}
-
-void rand_elt_3(double *x, double *y, double *z,
-                const double *zr, unsigned nr,
-                const double *zs, unsigned ns,
-                const double *zt, unsigned nt)
-{
-  static int init=0;
-  static double z4[4], lob_bnd_data[16+3*4*(2*16+1)],
-                work[2*16*16*(4+16+1)];
-  unsigned i,j,k;
-  double x0[3], g[9], H[18], jac[4*4*4], r[3];
-  struct dbl_range jr;
-  if(!init) {
-    init=1;
-    lobatto_nodes(z4,4);
-    lob_bnd_setup(lob_bnd_data,4,16);
-  }
-  do {
-    for(i=0;i< 9;++i) g[i] = -1+2*(rand()/(double)RAND_MAX);
-    for(i=0;i<18;++i) H[i] =.5*(-1+2*(rand()/(double)RAND_MAX));
-    for(k=0;k<4;++k) { r[2] = z4[k];
-      for(j=0;j<4;++j) { r[1] = z4[j];
-        for(i=0;i<4;++i) { r[0] = z4[i];
-          jac[(k*4+j)*4+i] = quad_3_jac(g,H,r);
-        }
-      }
-    }
-    jr = lob_bnd_3(lob_bnd_data,4,16, lob_bnd_data,4,16, lob_bnd_data,4,16,
-                   jac, work);
-    /*printf("Jacobian range %g, %g\n", jr.min, jr.max);*/
-  } while(jr.max*jr.min<=0);
-  for(i=0;i< 3;++i) x0[i] = -1+2*(rand()/(double)RAND_MAX);
-  for(k=0;k<nt;++k) {     r[2] = zt[k];
-    for(j=0;j<ns;++j) {   r[1] = zs[j];
-      for(i=0;i<nr;++i) { r[0] = zr[i];
-        x[(k*ns+j)*nr+i] = quad_3(x0[0],g  ,H   ,r);
-        y[(k*ns+j)*nr+i] = quad_3(x0[1],g+3,H+ 6,r);
-        z[(k*ns+j)*nr+i] = quad_3(x0[2],g+6,H+12,r);
-      }
-    }
-  }
-}
-
-#define PI 3.1415926535897932384626433832795028841971693993751058209749445923
-
-void bubble_elt(double *x, double *y, double *z,
-                const double *zr, unsigned nr,
-                const double *zs, unsigned ns,
-                const double *zt, unsigned nt, int type)
-{
-  unsigned i,j,k;
-  for(k=0;k<nt;++k) for(j=0;j<ns;++j) for(i=0;i<nr;++i) {
-    double dx=0,dy=0,dz=0;
-    switch(type) {
-      case 0: dx =  cos(PI*zs[j]/2)*cos(PI*zt[k]/2); break;
-      case 1: dx = -cos(PI*zs[j]/2)*cos(PI*zt[k]/2); break;
-      case 2: dy =  cos(PI*zt[k]/2)*cos(PI*zr[i]/2); break;
-      case 3: dy = -cos(PI*zt[k]/2)*cos(PI*zr[i]/2); break;
-      case 4: dz =  cos(PI*zr[i]/2)*cos(PI*zs[j]/2); break;
-      case 5: dz = -cos(PI*zr[i]/2)*cos(PI*zs[j]/2); break;
-    }
-    x[(k*ns+j)*nr+i] = zr[i] + dx;
-    y[(k*ns+j)*nr+i] = zs[j] + dy;
-    z[(k*ns+j)*nr+i] = zt[k] + dz;
-  }
-}
diff --git a/3rdParty/gslib/src/rand_elt_test.h b/3rdParty/gslib/src/rand_elt_test.h
deleted file mode 100644
index 40c53251e..000000000
--- a/3rdParty/gslib/src/rand_elt_test.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef RAND_ELT_TEST_H
-#define RAND_ELT_TEST_H
-
-void rand_elt_2(double *x, double *y,
-                const double *zr, unsigned nr,
-                const double *zs, unsigned ns);
-
-void rand_elt_3(double *x, double *y, double *z,
-                const double *zr, unsigned nr,
-                const double *zs, unsigned ns,
-                const double *zt, unsigned nt);
-
-void bubble_elt(double *x, double *y, double *z,
-                const double *zr, unsigned nr,
-                const double *zs, unsigned ns,
-                const double *zt, unsigned nt, int type);
-
-#endif
diff --git a/3rdParty/gslib/src/sarray_sort.c b/3rdParty/gslib/src/sarray_sort.c
deleted file mode 100644
index 0ec26d185..000000000
--- a/3rdParty/gslib/src/sarray_sort.c
+++ /dev/null
@@ -1,45 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "types.h"
-#include "fail.h"
-#include "mem.h"
-#include "sort.h"
-
-#define sarray_permute_     PREFIXED_NAME(sarray_permute_)
-#define sarray_permute_buf_ PREFIXED_NAME(sarray_permute_buf_)
-
-void sarray_permute_(size_t size, void *A, size_t n, uint *perm, void *work)
-{
-  char *const ar = A, *const item = work;
-  sint *const fperm = (sint*)perm;
-  uint i;
-  for(i=0;i<n;++i) {
-    sint pi = fperm[i];
-    if(pi<0) { fperm[i] = -pi-1; continue; }
-    else if((uint)pi==i) continue;
-    else {
-      char *dst = ar+i*size, *src = ar+pi*size;
-      memcpy(item, dst, size);
-      for(;;) {
-        sint ppi;
-        memcpy(dst, src, size);
-        dst=src;
-        ppi=fperm[pi], fperm[pi]=-ppi-1, pi=ppi;
-        if((uint)pi==i) break;
-        src=ar+pi*size;
-      }
-      memcpy(dst, item, size);
-    }
-  }
-}
-
-void sarray_permute_buf_(size_t align, size_t size, void *A, size_t n,
-                         buffer *buf)
-{
-  buffer_reserve(buf,align_as_(align,n*sizeof(uint)+size));
-  sarray_permute_(size,A,n, buf->ptr,
-                 (char*)buf->ptr + align_as_(align,n*sizeof(uint)));
-}
diff --git a/3rdParty/gslib/src/sarray_sort.h b/3rdParty/gslib/src/sarray_sort.h
deleted file mode 100644
index 77dc6531e..000000000
--- a/3rdParty/gslib/src/sarray_sort.h
+++ /dev/null
@@ -1,89 +0,0 @@
-#ifndef SARRAY_SORT_H
-#define SARRAY_SORT_H
-
-#if !defined(SORT_H)
-#warning "sarray_sort.h" requires "sort.h"
-#endif
-
-/*------------------------------------------------------------------------------
-  
-  Array of Structs Sort
-  
-  buffer *buf;
-  typedef struct { ... } T;
-  T A[n];
-
-  sarray_sort(T,A,n, field_name,is_long, buf)
-    - sort A according to the struct field "field_name",
-      which is a ulong/uint field according as is_long is true/false
-
-  sarray_sort_2(T,A,n, field1,is_long1, field2,is_long2, buf)
-    - sort A by field1 then field2
-
-  sarray_permute(T,A,n, perm, work)
-    - permute A  (in-place)
-      A[0] <- A[perm[0]], etc.
-      work needs to hold sizeof(T) bytes  (i.e., 1 T)
-
-  sarray_permute_buf(T,A,n, buf);
-    - permute A according to the permutation in buf
-      A[0] <- A[perm[0]], etc.
-      where uint *perm = buf->ptr   (see "sort.h")
-
-  ----------------------------------------------------------------------------*/
-
-
-#define sarray_permute_     PREFIXED_NAME(sarray_permute_)
-#define sarray_permute_buf_ PREFIXED_NAME(sarray_permute_buf_)
-
-void sarray_permute_(size_t size, void *A, size_t n, uint *perm, void *work);
-void sarray_permute_buf_(
-  size_t align, size_t size, void *A, size_t n, buffer *buf);
-
-#define sarray_permute(T,A,n, perm, work) \
-  sarray_permute_(sizeof(T),A,n, perm, work)
-#define sarray_permute_buf(T,A,n, buf) \
-  sarray_permute_buf_(ALIGNOF(T),sizeof(T),A,n,buf)
-
-#define sarray_sort_field(T,A,n, field,is_long, buf,keep) do { \
-  if(is_long) \
-    sortp_long(buf,keep, (ulong*)((char*)(A)+offsetof(T,field)),n,sizeof(T)); \
-  else \
-    sortp     (buf,keep, (uint *)((char*)(A)+offsetof(T,field)),n,sizeof(T)); \
-} while (0)
-
-#define sarray_sort(T,A,n, field,is_long, buf) do { \
-  sarray_sort_field(T,A,n, field,is_long, buf,0); \
-  sarray_permute_buf(T,A,n, buf); \
-} while (0)
-
-#define sarray_sort_2(T,A,n, field1,is_long1, field2,is_long2, buf) do { \
-  sarray_sort_field(T,A,n, field2,is_long2, buf,0); \
-  sarray_sort_field(T,A,n, field1,is_long1, buf,1); \
-  sarray_permute_buf(T,A,n, buf); \
-} while (0)
-
-#define sarray_sort_3(T,A,n, field1,is_long1, field2,is_long2, \
-                             field3,is_long3, buf) do { \
-  sarray_sort_field(T,A,n, field3,is_long3, buf,0); \
-  sarray_sort_field(T,A,n, field2,is_long2, buf,1); \
-  sarray_sort_field(T,A,n, field1,is_long1, buf,1); \
-  sarray_permute_buf(T,A,n, buf); \
-} while (0)
-
-#define sarray_sort_4(T,A,n, field1,is_long1, field2,is_long2, \
-                             field3,is_long3, field4,is_long4, buf) do { \
-  sarray_sort_field(T,A,n, field4,is_long4, buf,0); \
-  sarray_sort_field(T,A,n, field3,is_long3, buf,1); \
-  sarray_sort_field(T,A,n, field2,is_long2, buf,1); \
-  sarray_sort_field(T,A,n, field1,is_long1, buf,1); \
-  sarray_permute_buf(T,A,n, buf); \
-} while (0)
-
-static void sarray_perm_invert(
-  uint *const pinv, const uint *const perm, const uint n)
-{
-  uint i; for(i=0;i<n;++i) pinv[perm[i]] = i;
-}
-
-#endif
diff --git a/3rdParty/gslib/src/sarray_transfer.c b/3rdParty/gslib/src/sarray_transfer.c
deleted file mode 100644
index 18e80b0aa..000000000
--- a/3rdParty/gslib/src/sarray_transfer.c
+++ /dev/null
@@ -1,198 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "comm.h"
-#include "mem.h"
-#include "crystal.h"
-#include "sort.h"
-
-#define sarray_transfer_many PREFIXED_NAME(sarray_transfer_many)
-#define sarray_transfer_     PREFIXED_NAME(sarray_transfer_    )
-#define sarray_transfer_ext_ PREFIXED_NAME(sarray_transfer_ext_)
-
-static void pack_int(
-  buffer *const data, const unsigned row_size, const uint id,
-  const char *const restrict input, const uint n, const unsigned size,
-  const unsigned p_off, const uint *const restrict perm)
-{
-  const unsigned after = p_off + sizeof(uint), after_len = size-after;
-
-#define GET_P() memcpy(&p,row+p_off,sizeof(uint))
-#define COPY_ROW() memcpy(out,row,p_off), \
-                   memcpy((char*)out + p_off,row+after,after_len)
-
-#define PACK_BODY() do {                                                  \
-  uint dummy, *len_ptr=&dummy;                                            \
-  uint i, p,lp = UINT_MAX, len=0;                                         \
-  uint *restrict out = buffer_reserve(data, n*(row_size+3)*sizeof(uint)); \
-  for(i=0;i<n;++i) {                                                      \
-    const char *row = input + size*perm[i];                               \
-    GET_P();                                                              \
-    if(p!=lp) {                                                           \
-      lp = p;                                                             \
-      *len_ptr = len;       /* previous message length */                 \
-      *out++ = p;           /* target */                                  \
-      *out++ = id;          /* source */                                  \
-      len_ptr=out++; len=0; /* length (t.b.d.) */                         \
-    }                                                                     \
-    COPY_ROW();                                                           \
-    out += row_size, len += row_size;                                     \
-  }                                                                       \
-  *len_ptr = len; /* last message length */                               \
-  data->n = out - (uint*)data->ptr;                                       \
-} while(0)
-  PACK_BODY();
-#undef COPY_ROW
-#undef GET_P
-}
-
-static void pack_ext(
-  buffer *const data, const unsigned row_size, const uint id,
-  const char *const restrict input, const uint n, const unsigned size,
-  const uint *const restrict proc, const unsigned proc_stride,
-  const uint *const restrict perm)
-{
-  #define GET_P() p=*(const uint*)((const char*)proc+proc_stride*perm[i])
-  #define COPY_ROW() memcpy(out,row,size)
-  PACK_BODY();
-  #undef PACK_BODY
-  #undef COPY_ROW
-  #undef GET_P
-}
-
-static void pack_more(
-  buffer *const data, const unsigned off, const unsigned row_size,
-  const char *const restrict input, const unsigned size,
-  const uint *restrict perm)
-{
-  uint *restrict buf = data->ptr, *buf_end = buf+data->n;
-  while(buf!=buf_end) {
-    uint *msg_end = buf+3+buf[2]; buf+=3;
-    while(buf!=msg_end)
-      memcpy((char*)buf+off, input+size*(*perm++), size), buf+=row_size;
-  }
-}
-
-static void unpack_more(
-  char *restrict out, const unsigned size,
-  const buffer *const data, const unsigned off, const unsigned row_size)
-{
-  const uint *restrict buf = data->ptr, *buf_end = buf+data->n;
-  while(buf!=buf_end) {
-    const uint *msg_end = buf+3+buf[2]; buf+=3;
-    while(buf!=msg_end)
-      memcpy(out, (char*)buf+off, size), out+=size, buf+=row_size;
-  }
-}
-
-static void unpack_int(
-  char *restrict out, const unsigned size, const unsigned p_off,
-  const buffer *const data, const unsigned row_size, int set_src)
-{
-  const unsigned after = p_off + sizeof(uint), after_len = size-after;
-  const uint *restrict buf = data->ptr, *buf_end = buf+data->n;
-  const unsigned pi = set_src ? 1:0;
-  while(buf!=buf_end) {
-    const uint p=buf[pi], *msg_end = buf+3+buf[2]; buf+=3;
-    while(buf!=msg_end) {
-      memcpy(out,buf,p_off);
-      memcpy(out+p_off,&p,sizeof(uint));
-      memcpy(out+after,(const char *)buf+p_off,after_len);
-      out+=size, buf+=row_size;
-    }
-  }
-}
-
-static uint num_rows(const buffer *const data, const unsigned row_size)
-{
-  const uint *buf = data->ptr, *buf_end = buf + data->n;
-  uint n=0;
-  while(buf!=buf_end) { uint len=buf[2]; n+=len, buf+=len+3; }
-  return n/row_size;
-}
-
-static uint cap_rows(buffer *const data, const unsigned row_size,const uint max)
-{
-  uint *buf = data->ptr, *buf_end = buf + data->n;
-  const uint maxn = max*row_size;
-  uint n=0;
-  while(buf!=buf_end) {
-    uint len=buf[2]; n+=len;
-    if(n<maxn) buf+=len+3;
-    else {
-      buf[2]-=(maxn-n); data->n = (buf-(uint*)data->ptr)+3+buf[2];
-      buf+=len+3;
-      while(buf!=buf_end) { uint llen=buf[2]; n+=llen, buf+=llen+3; }
-      break;
-    }
-  }
-  return n/row_size;
-}
-
-/* An must be >= 1 */
-uint sarray_transfer_many(
-  struct array *const *const A, const unsigned *const size, const unsigned An,
-  const int fixed, const int ext, const int set_src, const unsigned p_off,
-  const uint *const restrict proc, const unsigned proc_stride,
-  struct crystal *const cr)
-{
-  uint n, *perm;
-  unsigned i,row_size,off,off1;
-
-  off1 = size[0];
-  if(!ext) off1 -= sizeof(uint);
-  row_size=off1; for(i=1;i<An;++i) row_size += size[i];
-  row_size = (row_size+sizeof(uint)-1)/sizeof(uint);
-
-  perm = sortp(&cr->work,0, proc,A[0]->n,proc_stride);
-
-  if(!ext) pack_int(&cr->data, row_size, cr->comm.id, A[0]->ptr,A[0]->n,size[0],
-                    p_off, perm);
-  else     pack_ext(&cr->data, row_size, cr->comm.id, A[0]->ptr,A[0]->n,size[0],
-                    proc,proc_stride, perm);
-  for(off=off1,i=1;i<An;++i) if(size[i])
-    pack_more(&cr->data,off,row_size, A[i]->ptr,size[i], perm),off+=size[i];
-
-  crystal_router(cr);
-
-  if(!fixed) {
-    n = num_rows(&cr->data,row_size);
-    for(i=0;i<An;++i)
-      array_reserve_(A[i],n,size[i],__FILE__,__LINE__), A[i]->n=n;
-  } else {
-    uint max=A[0]->max, an;
-    for(i=1;i<An;++i) if(A[i]->max<max) max=A[i]->max;
-    n = cap_rows(&cr->data,row_size, max);
-    an = n>max?max:n;
-    for(i=0;i<An;++i) A[i]->n=an;
-  }
-
-  if(!ext) unpack_int (A[0]->ptr,size[0],p_off, &cr->data,  row_size, set_src);
-  else     unpack_more(A[0]->ptr,size[0],       &cr->data,0,row_size);
-  for(off=off1,i=1;i<An;++i) if(size[i])
-    unpack_more(A[i]->ptr,size[i], &cr->data,off,row_size),off+=size[i];
-
-  return n;
-}
-
-
-void sarray_transfer_(struct array *const A, const unsigned size,
-                      const unsigned p_off, const int set_src,
-                      struct crystal *const cr)
-{
-  sarray_transfer_many(&A,&size,1, 0,0,set_src,p_off,
-                       (uint*)((char*)A->ptr+p_off),size, cr);
-}
-
-void sarray_transfer_ext_(struct array *const A, const unsigned size,
-                          const uint *const proc, const unsigned proc_stride,
-                          struct crystal *const cr)
-{
-  sarray_transfer_many(&A,&size,1, 0,1,0,0, proc,proc_stride, cr);
-}
-
diff --git a/3rdParty/gslib/src/sarray_transfer.h b/3rdParty/gslib/src/sarray_transfer.h
deleted file mode 100644
index c195e2174..000000000
--- a/3rdParty/gslib/src/sarray_transfer.h
+++ /dev/null
@@ -1,95 +0,0 @@
-#ifndef SARRAY_TRANSFER_H
-#define SARRAY_TRANSFER_H
-
-#if !defined(CRYSTAL_H)
-#warning "sarray_transfer.h" requires "crystal.h"
-#endif
-
-/*
-  High-level interface for the crystal router.
-  Given an array of structs, transfers each to the process indicated
-  by a field of the struct, which gets set to the source process on output.
-  
-  For the dynamic "array" type, see "mem.h".
-  
-  Requires a "crystal router" object:
-  
-    struct comm c;
-    struct crystal cr;
-    
-    comm_init(&c, MPI_COMM_WORLD);
-    crystal_init(&cr, &c);
-    
-  Example sarray_transfer usage:
-  
-    struct T { ...; uint proc; ...; };
-    struct array A = null_array;
-    struct T *p, *e;
-    
-    // resize A to 100 struct T's, fill up with data
-    p = array_reserve(struct T, &A, 100), A.n=100;
-    for(e=p+A.n;p!=e;++p) {
-      ...
-      p->proc = ...;
-      ...
-    }
-    
-    // array A represents the array
-    //   struct T ar[A.n]    where &ar[0] == A.ptr
-    // transfer ar[i] to processor ar[i].proc  for each i=0,...,A.n-1:
-    
-    sarray_transfer(struct T, A, proc,set_src, &cr);
-    
-    // now array A represents a different array with a different size
-    //   struct T ar[A.n]    where &ar[0] == A.ptr
-    // the ordering is arbitrary
-    // if set_src != 0, ar[i].proc is set to the proc where ar[i] came from
-    // otherwise ar[i].proc is unchanged (and == this proc id)
-    
-    // note: two calls of
-    sarray_transfer(struct T, A, proc,1, &cr);
-    // in a row should return A to its original state, up to ordering
- 
-  Cleanup:
-    array_free(&A);
-    crystal_free(&cr);
-    comm_free(&c);
-
-  Example sarray_transfer_ext usage:
-  
-    struct T { ... };
-    struct array A;
-    uint proc[A.n];
-    
-    // array A represents the array
-    //   struct T ar[A.n]    where &ar[0] == A.ptr
-    // transfer ar[i] to processor proc[i]  for each i=0,...,A.n-1:
-    sarray_transfer_ext(struct T, &A, proc, &cr);
-    
-    // no information is available now on where each struct came from
-
-*/
-
-#define sarray_transfer_many PREFIXED_NAME(sarray_transfer_many)
-#define sarray_transfer_     PREFIXED_NAME(sarray_transfer_    )
-#define sarray_transfer_ext_ PREFIXED_NAME(sarray_transfer_ext_)
-
-uint sarray_transfer_many(
-  struct array *const *const A, const unsigned *const size, const unsigned An,
-  const int fixed, const int ext, const int set_src, const unsigned p_off,
-  const uint *const restrict proc, const unsigned proc_stride,
-  struct crystal *const cr);
-void sarray_transfer_(struct array *const A, const unsigned size,
-                      const unsigned p_off, const int set_src,
-                      struct crystal *const cr);
-void sarray_transfer_ext_(struct array *const A, const unsigned size,
-                          const uint *const proc, const unsigned proc_stride,
-                          struct crystal *const cr);
-
-#define sarray_transfer(T,A,proc_field,set_src,cr) \
-  sarray_transfer_(A,sizeof(T),offsetof(T,proc_field),set_src,cr)
-
-#define sarray_transfer_ext(T,A,proc,proc_stride,cr) \
-  sarray_transfer_ext_(A,sizeof(T),proc,proc_stride,cr)
-
-#endif
diff --git a/3rdParty/gslib/src/sort.c b/3rdParty/gslib/src/sort.c
deleted file mode 100644
index 5b25f429f..000000000
--- a/3rdParty/gslib/src/sort.c
+++ /dev/null
@@ -1,31 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "mem.h"
-
-#define T unsigned int
-#define SORT_SUFFIX _ui
-#include "sort_imp.h"
-#undef SORT_SUFFIX
-#undef T
-
-#if defined(USE_LONG) || defined(GLOBAL_LONG)
-#  define T unsigned long
-#  define SORT_SUFFIX _ul
-#  include "sort_imp.h"
-#  undef SORT_SUFFIX
-#  undef T
-#endif
-
-#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG)
-#  define T unsigned long long
-#  define SORT_SUFFIX _ull
-#  include "sort_imp.h"
-#  undef SORT_SUFFIX
-#  undef T
-#endif
diff --git a/3rdParty/gslib/src/sort.h b/3rdParty/gslib/src/sort.h
deleted file mode 100644
index eaeeb957a..000000000
--- a/3rdParty/gslib/src/sort.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef SORT_H
-#define SORT_H
-
-#if !defined(TYPES_H) || !defined(MEM_H)
-#warning "sort.h" requires "types.h" and "mem.h"
-/* types.h defines uint, ulong
-   mem.h   defines buffer */
-#endif
-
-/*------------------------------------------------------------------------------
-  
-  Sort
-  
-  O(n) stable sort with good performance for all n
-
-  sortv     (uint  *out,  const uint  *A, uint n, uint stride,  buffer *buf)
-  sortv_long(ulong *out,  const ulong *A, uint n, uint stride,  buffer *buf)
-
-  sortp     (buffer *buf, int perm_start,  const uint  *A, uint n, uint stride)
-  sortp_long(buffer *buf, int perm_start,  const ulong *A, uint n, uint stride)
-
-  A, n, stride : specifices the input (stride is in bytes!)
-  out : the sorted values on output
-
-  For the value sort, (sortv*)
-    A and out may alias (A == out) exactly when stride == sizeof(T)
-
-  For the permutation sort, (sortp*)
-    The permutation can be both input (when start_perm!=0) and output,
-    following the convention that it is always at the start of the buffer buf:
-      uint *perm = buf->ptr;
-    The permutation denotes the ordering
-      A[perm[0]], A[perm[1]], ..., A[perm[n-1]]
-    (assuming stride == sizeof(uint) or sizeof(ulong) as appropriate)
-    and is re-arranged stably to give a sorted ordering.
-    Specifying start_perm==0 is equivalent to specifying
-      perm[i] = i,   i=0,...,n-1
-    for an initial permutation (but may be faster).
-    The buffer will be expanded as necessary to accomodate the permutation
-    and the required scratch space.
-  
-  Most code calls these routines indirectly via the higher-level routine
-    sarray_sort for sorting arrays of structs (see "sarray_sort.h").
-
-  ----------------------------------------------------------------------------*/
-
-#define sortv_ui  PREFIXED_NAME(sortv_ui)
-#define sortv_ul  PREFIXED_NAME(sortv_ul)
-#define sortv_ull PREFIXED_NAME(sortv_ull)
-#define sortp_ui  PREFIXED_NAME(sortp_ui)
-#define sortp_ul  PREFIXED_NAME(sortp_ul)
-#define sortp_ull PREFIXED_NAME(sortp_ull)
-
-#define sortv TYPE_LOCAL(sortv_ui,sortv_ul,sortv_ull)
-#define sortp TYPE_LOCAL(sortp_ui,sortp_ul,sortp_ull)
-#define sortv_long TYPE_GLOBAL(sortv_ui,sortv_ul,sortv_ull)
-#define sortp_long TYPE_GLOBAL(sortp_ui,sortp_ul,sortp_ull)
-
-void sortv_ui(unsigned *out, const unsigned *A, uint n, unsigned stride,
-              buffer *restrict buf);
-void sortv_ul(unsigned long *out,
-              const unsigned long *A, uint n, unsigned stride,
-              buffer *restrict buf);
-uint *sortp_ui(buffer *restrict buf, int start_perm,
-               const unsigned *restrict A, uint n, unsigned stride);
-uint *sortp_ul(buffer *restrict buf, int start_perm,
-               const unsigned long *restrict A, uint n, unsigned stride);
-#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG)
-void sortv_ull(unsigned long long *out,
-               const unsigned long long *A, uint n, unsigned stride,
-               buffer *restrict buf);
-uint *sortp_ull(buffer *restrict buf, int start_perm,
-                const unsigned long long *restrict A, uint n, unsigned stride);
-#endif
-
-#endif
diff --git a/3rdParty/gslib/src/sort_imp.h b/3rdParty/gslib/src/sort_imp.h
deleted file mode 100644
index e772f91aa..000000000
--- a/3rdParty/gslib/src/sort_imp.h
+++ /dev/null
@@ -1,544 +0,0 @@
-#if !defined(T) || !defined(SORT_SUFFIX)
-#error sort_imp.h not meant to be compiled by itself
-#endif
-
-#define sort_data       TOKEN_PASTE(sort_data      ,SORT_SUFFIX)
-#define radix_count     TOKEN_PASTE(radix_count    ,SORT_SUFFIX)
-#define radix_offsets   TOKEN_PASTE(radix_offsets  ,SORT_SUFFIX)
-#define radix_zeros     TOKEN_PASTE(radix_zeros    ,SORT_SUFFIX)
-#define radix_passv     TOKEN_PASTE(radix_passv    ,SORT_SUFFIX)
-#define radix_sortv     TOKEN_PASTE(radix_sortv    ,SORT_SUFFIX)
-#define radix_passp0_b  TOKEN_PASTE(radix_passp0_b ,SORT_SUFFIX)
-#define radix_passp_b   TOKEN_PASTE(radix_passp_b  ,SORT_SUFFIX)
-#define radix_passp_m   TOKEN_PASTE(radix_passp_m  ,SORT_SUFFIX)
-#define radix_passp_e   TOKEN_PASTE(radix_passp_e  ,SORT_SUFFIX)
-#define radix_passp0_be TOKEN_PASTE(radix_passp0_be,SORT_SUFFIX)
-#define radix_passp_be  TOKEN_PASTE(radix_passp_be, SORT_SUFFIX)
-#define radix_sortp     TOKEN_PASTE(radix_sortp    ,SORT_SUFFIX)
-#define merge_sortv     TOKEN_PASTE(merge_sortv    ,SORT_SUFFIX)
-#define merge_copy_perm TOKEN_PASTE(merge_copy_perm,SORT_SUFFIX)
-#define merge_sortp0    TOKEN_PASTE(merge_sortp0   ,SORT_SUFFIX)
-#define merge_sortp     TOKEN_PASTE(merge_sortp    ,SORT_SUFFIX)
-#define heap_sortv      TOKEN_PASTE(heap_sortv     ,SORT_SUFFIX)
-
-#define sortv PREFIXED_NAME(TOKEN_PASTE(sortv,SORT_SUFFIX))
-#define sortp PREFIXED_NAME(TOKEN_PASTE(sortp,SORT_SUFFIX))
-
-typedef struct { T v; uint i; } sort_data;
-
-#define INC_PTR(A,stride) ((A)=(T*)((char*)(A)+(stride)))
-#define INDEX_PTR(A,stride,i) (*(T*)((char*)(A)+(i)*(stride)))
-
-/*------------------------------------------------------------------------------
-
-  Radix Sort
-
-  stable; O(n+k) time and extra storage
-    where k = (digits in an int) * 2^(bits per digit)
-    (e.g. k = 4 * 256 = 1024 for 32-bit ints with 8-bit digits)
-
-  brief description:
-    input sorted stably on each digit, starting with the least significant
-    counting sort is used for each digit:
-      a pass through the input counts the occurences of each digit value
-      on a second pass, each input has a known destination
-
-  tricks:
-    all counting passes are combined into one
-    the counting pass also computes the inclusive bit-wise or of all inputs,
-      which is used to skip digit positions for which all inputs have zeros
-
-  ----------------------------------------------------------------------------*/
-
-#define STATIC_DIGIT_BUCKETS 1
-
-#define DIGIT_BITS   8
-#define DIGIT_VALUES (1<<DIGIT_BITS)
-#define DIGIT_MASK   ((T)(DIGIT_VALUES-1))
-#define CEILDIV(a,b) (((a)+(b)-1)/(b))
-#define DIGITS       CEILDIV(CHAR_BIT*sizeof(T),DIGIT_BITS)
-#define VALUE_BITS   (DIGIT_BITS*DIGITS)
-#define COUNT_SIZE   (DIGITS*DIGIT_VALUES)
-
-/* used to unroll a tiny loop: */
-#define COUNT_DIGIT_01(n,i) \
-    if(n>i) count[i][val&DIGIT_MASK]++, val>>=DIGIT_BITS
-#define COUNT_DIGIT_02(n,i) COUNT_DIGIT_01(n,i); COUNT_DIGIT_01(n,i+ 1)
-#define COUNT_DIGIT_04(n,i) COUNT_DIGIT_02(n,i); COUNT_DIGIT_02(n,i+ 2)
-#define COUNT_DIGIT_08(n,i) COUNT_DIGIT_04(n,i); COUNT_DIGIT_04(n,i+ 4)
-#define COUNT_DIGIT_16(n,i) COUNT_DIGIT_08(n,i); COUNT_DIGIT_08(n,i+ 8)
-#define COUNT_DIGIT_32(n,i) COUNT_DIGIT_16(n,i); COUNT_DIGIT_16(n,i+16)
-#define COUNT_DIGIT_64(n,i) COUNT_DIGIT_32(n,i); COUNT_DIGIT_32(n,i+32)
-
-static T radix_count(
-  uint (*restrict count)[DIGIT_VALUES],
-  const T *restrict A, const T *const end, const unsigned stride)
-{
-  T bitorkey = 0;
-  memset(count,0,COUNT_SIZE*sizeof(uint));
-  do {
-    T val=*A;
-    bitorkey|=val;
-    COUNT_DIGIT_64(DIGITS,0);
-    /* above macro expands to:
-    if(DIGITS> 0) count[ 0][val&DIGIT_MASK]++, val>>=DIGIT_BITS;
-    if(DIGITS> 1) count[ 1][val&DIGIT_MASK]++, val>>=DIGIT_BITS;
-      ...
-    if(DIGITS>63) count[63][val&DIGIT_MASK]++, val>>=DIGIT_BITS;
-    */
-  } while(INC_PTR(A,stride),A!=end);
-  return bitorkey;
-}
-
-#undef COUNT_DIGIT_01
-#undef COUNT_DIGIT_02
-#undef COUNT_DIGIT_04
-#undef COUNT_DIGIT_08
-#undef COUNT_DIGIT_16
-#undef COUNT_DIGIT_32
-#undef COUNT_DIGIT_64
-
-static void radix_offsets(uint *restrict c)
-{
-  uint *const ce = c+DIGIT_VALUES;
-  uint sum = 0;
-  do {
-    const uint c0=c[0], c1=c[1], c2=c[2], c3=c[3];
-    const uint o1=sum+c0, o2=o1+c1, o3=o2+c2;
-    c[0]=sum, c[1]=o1, c[2]=o2, c[3]=o3;
-    sum = o3+c3;
-    c+=4;
-  } while(c!=ce);
-}
-
-static unsigned radix_zeros(
-  T bitorkey, uint (*restrict count)[DIGIT_VALUES],
-  unsigned *restrict shift, uint **restrict offsets)
-{
-  unsigned digits=0, sh=0; uint *c = &count[0][0];
-  do {
-    if(bitorkey&DIGIT_MASK) *shift++ = sh, *offsets++ = c, ++digits,
-                            radix_offsets(c);
-  } while(bitorkey>>=DIGIT_BITS,sh+=DIGIT_BITS,c+=DIGIT_VALUES,sh!=VALUE_BITS);
-  return digits;
-}
-
-static void radix_passv(
-  const T *restrict A, const T *const end, const unsigned stride,
-  const unsigned sh, uint *const restrict off, T *const restrict out)
-{
-  do out[off[(*A>>sh)&DIGIT_MASK]++] = *A; while(INC_PTR(A,stride),A!=end);
-}
-
-static void radix_sortv(
-  T *out, const T *A, const uint n, const unsigned stride,
-  T *work, uint (*restrict count)[DIGIT_VALUES])
-{
-  const T *const end = &INDEX_PTR(A,stride,n);
-  T bitorkey = radix_count(count, A,end,stride);
-  unsigned shift[DIGITS]; uint *offsets[DIGITS];
-  const unsigned digits = radix_zeros(bitorkey,count,shift,offsets);
-  if(digits==0) {
-    memset(out,0,n*sizeof(T));
-  } else {
-    T *src, *dst; unsigned d;
-    if(out==A || (digits&1)==0) dst=out,src=work;
-                           else src=out,dst=work;
-    radix_passv(A,end,stride,shift[0],offsets[0],src);
-    for(d=1;d!=digits;++d) {
-      T *t;
-      radix_passv(src,src+n,sizeof(T),shift[d],offsets[d],dst);
-      t=src,src=dst,dst=t;
-    }
-    if(src!=out) memcpy(out,src,n*sizeof(T));
-  }
-}
-
-static void radix_passp0_b(
-  const T *restrict A, const uint n, const unsigned stride,
-  const unsigned sh, uint *const restrict off,
-  sort_data *const restrict out)
-{
-  uint i=0;
-  do {
-    T v = *A;
-    sort_data *d = &out[off[(v>>sh)&DIGIT_MASK]++];
-    d->v=v, d->i=i++;
-  } while(INC_PTR(A,stride),i!=n);
-}
-
-static void radix_passp_b(
-  const uint *restrict p,
-  const T *const restrict A, const uint n, const unsigned stride,
-  const unsigned sh, uint *const restrict off,
-  sort_data *const out)
-{
-  const uint *const pe = p+n;
-  do {
-    uint j = *p++;
-    T v = INDEX_PTR(A,stride,j);
-    sort_data *d = &out[off[(v>>sh)&DIGIT_MASK]++];
-    d->v=v, d->i=j;
-  } while(p!=pe);
-}
-
-static void radix_passp_m(
-  const sort_data *restrict src, const sort_data *const end,
-  const unsigned sh, uint *const restrict off,
-  sort_data *const restrict out)
-{
-  do {
-    sort_data *d = &out[off[(src->v>>sh)&DIGIT_MASK]++];
-    d->v=src->v,d->i=src->i;
-  } while(++src!=end);
-}
-
-static void radix_passp_e(
-  const sort_data *restrict src, const sort_data *const end,
-  const unsigned sh, uint *const restrict off,
-  uint *const restrict out)
-{
-  do out[off[(src->v>>sh)&DIGIT_MASK]++]=src->i; while(++src!=end);
-}
-
-static void radix_passp0_be(
-  uint *const restrict out,
-  const T *restrict A, const uint n, const unsigned stride,
-  const unsigned sh, uint *const restrict off)
-{
-  uint i=0;
-  do out[off[(*A>>sh)&DIGIT_MASK]++]=i++; while(INC_PTR(A,stride),i!=n);
-}
-
-static void radix_passp_be(
-  uint *restrict p,
-  const T *restrict A, const uint n, const unsigned stride,
-  const unsigned sh, uint *const restrict off,
-  sort_data *restrict work)
-{
-  uint *q = p, *const qe = p+n;
-  uint *w = &work[0].i;
-  do {
-    uint j = *q++;
-    T v = INDEX_PTR(A,stride,j);
-    w[off[(v>>sh)&DIGIT_MASK]++]=j;
-  } while(q!=qe);
-  memcpy(p,w,n*sizeof(uint));
-}
-
-static void radix_sortp(
-  uint *restrict idx, uint perm_start,
-  const T *restrict A, const uint n, const unsigned stride,
-  sort_data *restrict work,
-  uint (*restrict count)[DIGIT_VALUES])
-{
-  T bitorkey = radix_count(count, A,&INDEX_PTR(A,stride,n),stride);
-  unsigned shift[DIGITS]; uint *offsets[DIGITS];
-  unsigned digits = radix_zeros(bitorkey,count,shift,offsets);
-  if(digits==0) {
-    if(!perm_start) { uint i=0; do *idx++=i++; while(i!=n); }
-  } else if(digits==1) {
-    if(perm_start) radix_passp_be (idx,A,n,stride,shift[0],offsets[0],work);
-              else radix_passp0_be(idx,A,n,stride,shift[0],offsets[0]);
-  } else {
-    sort_data *src, *dst; unsigned d;
-    if((digits&1)==0) dst=work,src=dst+n;
-                 else src=work,dst=src+n;
-    if(perm_start) radix_passp_b (idx,A,n,stride,shift[0],offsets[0],src);
-              else radix_passp0_b(    A,n,stride,shift[0],offsets[0],src);
-    for(d=1;d!=digits-1;++d) {
-      sort_data *t;
-      radix_passp_m(src,src+n,shift[d],offsets[d],dst);
-      t=src,src=dst,dst=t;
-    }
-    radix_passp_e(src,src+n,shift[d],offsets[d],idx);
-  }
-}
-
-/*------------------------------------------------------------------------------
-
-  Merge Sort
-
-  stable; O(n log n) time
-
-  ----------------------------------------------------------------------------*/
-
-#define MERGE_2(p,v)                           \
-  if(VAL(v[1])<VAL(v[0])) p[0]=v[1],p[1]=v[0]; \
-                     else p[0]=v[0],p[1]=v[1]
-#define MERGE_3(p,v) do                                              \
-  if(VAL(v[1])<VAL(v[0])) {                                          \
-    if(VAL(v[2])<VAL(v[1]))        p[0]=v[2],p[1]=v[1],p[2]=v[0];    \
-    else { if(VAL(v[2])<VAL(v[0])) p[0]=v[1],p[1]=v[2],p[2]=v[0];    \
-                              else p[0]=v[1],p[1]=v[0],p[2]=v[2]; }  \
-  } else {                                                           \
-     if(VAL(v[2])<VAL(v[0]))        p[0]=v[2],p[1]=v[0],p[2]=v[1];   \
-     else { if(VAL(v[2])<VAL(v[1])) p[0]=v[0],p[1]=v[2],p[2]=v[1];   \
-                               else p[0]=v[0],p[1]=v[1],p[2]=v[2]; } \
-  } while(0)
-#define MERGE_SORT() \
-  do {                                                                 \
-    uint i=0, n=An, odd=0, c=0, b=1;                                   \
-    sint base=-n;                                                      \
-    for(;;) {                                                          \
-      DATA *restrict p;                                                \
-      if((c&1)==0) {                                                   \
-        base+=n, n+=(odd&1), c|=1, b^=1;                               \
-        while(n>3) odd<<=1,odd|=(n&1),n>>=1,c<<=1,b^=1;                \
-      } else                                                           \
-        base-=n-(odd&1),n<<=1,n-=(odd&1),odd>>=1,c>>=1;                \
-      if(c==0) break;                                                  \
-      p = buf[b]+base;                                                 \
-      if(n==2) {                                                       \
-        DATA v[2]; SETVAL(v[0],i), SETVAL(v[1],i+1);                   \
-        MERGE_2(p,v);                                                  \
-        i+=2;                                                          \
-      } else if(n==3) {                                                \
-        DATA v[3]; SETVAL(v[0],i), SETVAL(v[1],i+1), SETVAL(v[2],i+2); \
-        MERGE_3(p,v);                                                  \
-        i+=3;                                                          \
-      } else {                                                         \
-        const uint na = n>>1, nb = (n+1)>>1;                           \
-        const DATA *restrict ap = buf[b^1]+base, *const ae = ap+na;    \
-        DATA *restrict bp = p+na, *const be = bp+nb;                   \
-        for(;;) {                                                      \
-          if(VAL((*bp))<VAL((*ap))) {                                  \
-            *p++=*bp++;                                                \
-            if(bp!=be) continue;                                       \
-            do *p++=*ap++; while(ap!=ae);                              \
-            break;                                                     \
-          } else {                                                     \
-            *p++=*ap++;                                                \
-            if(ap==ae) break;                                          \
-          }                                                            \
-        }                                                              \
-      }                                                                \
-    }                                                                  \
-  } while(0)
-
-static void merge_sortv(
-  T *restrict out,
-  const T *restrict A, const uint An, const unsigned stride,
-  T *restrict work)
-{
-  T *buf[2]; buf[0]=out, buf[1]=work;
-#define DATA T
-#define VAL(x) x
-#define SETVAL(x,ai) x=*A,INC_PTR(A,stride)
-  MERGE_SORT();
-#undef SETVAL
-#undef VAL
-#undef DATA
-}
-
-static void merge_copy_perm(
-  uint *restrict idx, const sort_data *restrict p, uint n)
-{
-  /*const sort_data *pe = p+n;
-  do *idx++ = (p++)->i; while(p!=pe);*/
-  uint n_by_8 = (n+7)/8;
-  switch(n%8) {
-    case 0: do { *idx++ = (p++)->i;
-    case 7:      *idx++ = (p++)->i;
-    case 6:      *idx++ = (p++)->i;
-    case 5:      *idx++ = (p++)->i;
-    case 4:      *idx++ = (p++)->i;
-    case 3:      *idx++ = (p++)->i;
-    case 2:      *idx++ = (p++)->i;
-    case 1:      *idx++ = (p++)->i;
-    } while (--n_by_8 > 0);
-  }
-}
-
-static void merge_sortp0(
-  uint *restrict idx,
-  const T *restrict A, const uint An, const unsigned stride,
-  sort_data *restrict work)
-{
-  sort_data *buf[2]; buf[0]=work+An,buf[1]=work;
-#define DATA sort_data
-#define VAL(x) x.v
-#define SETVAL(x,ai) x.v=*A,INC_PTR(A,stride),x.i=ai
-  MERGE_SORT();
-#undef SETVAL
-#undef VAL
-#undef DATA
-  merge_copy_perm(idx,buf[0],An);
-}
-
-static void merge_sortp(
-  uint *restrict idx,
-  const T *const restrict A, const uint An, const unsigned stride,
-  sort_data *restrict work)
-{
-  sort_data *buf[2]; buf[0]=work+An,buf[1]=work;
-#define DATA sort_data
-#define VAL(x) x.v
-#define SETVAL(x,ai) x.i=idx[ai],x.v=INDEX_PTR(A,stride,x.i)
-  MERGE_SORT();
-#undef SETVAL
-#undef VAL
-#undef DATA
-  merge_copy_perm(idx,buf[0],An);
-}
-
-#undef MERGE_SORT
-#undef MERGE_3
-#undef MERGE_2
-
-/*------------------------------------------------------------------------------
-
-  Heap Sort
-
-  in-place, stability unobservable; O(n log n) time
-
-  ----------------------------------------------------------------------------*/
-static void heap_sortv(T *const restrict A, unsigned n)
-{
-  unsigned i;
-  /* build heap */
-  for(i=1;i<n;++i) {
-    T item = A[i];
-    unsigned h=i, p = (h-1)>>1;
-    if(A[p] >= item) continue;
-    do A[h]=A[p], h=p, p=(p-1)>>1; while(h && A[p] < item);
-    A[h] = item;
-  }
-  /* extract */
-  for(i=n-1;i;--i) {
-    T item = A[i];
-    unsigned h = 0;
-    A[i] = A[0];
-    for(;;) {
-      unsigned ch = 1+(h<<1), r = ch+1;
-      if(r<i && A[ch] < A[r]) ch=r;
-      if(ch>=i || item >= A[ch]) break;
-      A[h]=A[ch], h=ch;
-    }
-    A[h] = item;
-  }
-}
-
-
-/*------------------------------------------------------------------------------
-
-  Hybrid Stable Sort
-
-  low-overhead merge sort when n is small,
-  otherwise asymptotically superior radix sort
-
-  result = O(n) sort with good performance for all n
-
-  A, n, stride : specifices the input, stride in bytes
-  out : the sorted values on output
-
-  For the value sort,
-    A and out may alias (A == out) exactly when stride == sizeof(T),
-      in which case heap sort is used for small sizes
-
-  For the permutation sort,
-    the permutation can be both input (when start_perm!=0) and output,
-    following the convention that it is always at the start of the buffer buf;
-    the buffer will be expanded as necessary to accomodate the permutation
-    and the required scratch space
-
-  ----------------------------------------------------------------------------*/
-
-void sortv(T *out, const T *A, uint n, unsigned stride, buffer *restrict buf)
-{
-  if(n<DIGIT_VALUES) {
-    if(n<2) {
-      if(n==0) return;
-      *out = *A;
-    } else {
-      if(out==A) {
-        if(stride!=sizeof(T))
-          fail(1,__FILE__,__LINE__,"in-place sort with non-unit stride");
-        heap_sortv(out,n);
-      } else {
-        buffer_reserve(buf,n*sizeof(T));
-        merge_sortv(out, A,n,stride, (T*)buf->ptr);
-      }
-    }
-  } else if(STATIC_DIGIT_BUCKETS) {
-    static uint count[DIGITS][DIGIT_VALUES];
-    buffer_reserve(buf,n*sizeof(T));
-    radix_sortv(out, A,n,stride, (T*)buf->ptr,count);
-  } else {
-    T *restrict work;
-    uint (*restrict count)[DIGIT_VALUES];
-    const size_t count_off=align_as(uint,n*sizeof(T));
-    buffer_reserve(buf,count_off+sizeof(uint[DIGITS][DIGIT_VALUES]));
-    work = buf->ptr;
-    count = (uint(*)[DIGIT_VALUES])((char*)buf->ptr+count_off);
-    radix_sortv(out, A,n,stride, work,count);
-  }
-}
-
-uint *sortp(buffer *restrict buf, int start_perm,
-            const T *restrict A, uint n, unsigned stride)
-{
-  uint *restrict perm;
-  sort_data *restrict work;
-  size_t work_off=align_as(sort_data,n*sizeof(uint));
-  if(n<DIGIT_VALUES) {
-    buffer_reserve(buf,work_off+2*n*sizeof(sort_data));
-    perm = buf->ptr;
-    work = (sort_data*)((char*)buf->ptr+work_off);
-    if(n<2) {
-      if(n==1) *perm=0;
-    } else {
-      if(start_perm) merge_sortp (perm, A,n,stride, work);
-      else           merge_sortp0(perm, A,n,stride, work);
-    }
-  } else if(STATIC_DIGIT_BUCKETS){
-    static uint count[DIGITS][DIGIT_VALUES];
-    buffer_reserve(buf,work_off+2*n*sizeof(sort_data));
-    perm = buf->ptr;
-    work = (sort_data*)((char*)buf->ptr+work_off);
-    radix_sortp(perm,start_perm, A,n,stride, work,count);
-  } else {
-    uint (*restrict count)[DIGIT_VALUES];
-    const size_t count_off=align_as(uint,work_off+2*n*sizeof(sort_data));
-    buffer_reserve(buf,count_off+sizeof(uint[DIGITS][DIGIT_VALUES]));
-    perm = buf->ptr;
-    work = (sort_data*)((char*)buf->ptr+work_off);
-    count = (uint(*)[DIGIT_VALUES])((char*)buf->ptr+count_off);
-    radix_sortp(perm,start_perm, A,n,stride, work,count);
-  }
-  return perm;
-}
-
-#undef STATIC_DIGIT_BUCKETS
-
-#undef DIGIT_BITS
-#undef DIGIT_VALUES
-#undef DIGIT_MASK
-#undef CEILDIV
-#undef DIGITS
-#undef VALUE_BITS
-#undef COUNT_SIZE
-
-#undef INDEX_PTR
-#undef INC_PTR
-
-#undef sortp
-#undef sortv
-
-#undef merge_sortp
-#undef merge_sortp0
-#undef merge_sortv
-#undef radix_sortp
-#undef radix_passp_be
-#undef radix_passp0_be
-#undef radix_passp_e
-#undef radix_passp_m
-#undef radix_passp_b
-#undef radix_passp0_b
-#undef radix_sortv
-#undef radix_passv
-#undef radix_zeros
-#undef radix_offsets
-#undef radix_count
-#undef sort_data
-
diff --git a/3rdParty/gslib/src/tensor.c b/3rdParty/gslib/src/tensor.c
deleted file mode 100644
index a72471418..000000000
--- a/3rdParty/gslib/src/tensor.c
+++ /dev/null
@@ -1,82 +0,0 @@
-#include "c99.h"
-#include "name.h"
-#include "types.h"
-
-#if !defined(USE_CBLAS)
-
-#define tensor_dot  PREFIXED_NAME(tensor_dot )
-#define tensor_mtxm PREFIXED_NAME(tensor_mtxm)
-
-/* Matrices are always column-major (FORTRAN style) */
-
-double tensor_dot(const double *a, const double *b, uint n)
-{
-  double sum = 0;
-  for(;n;--n) sum += *a++ * *b++;
-  return sum;
-}
-
-#  if defined(USE_NAIVE_BLAS)
-#    define tensor_mxv  PREFIXED_NAME(tensor_mxv )
-#    define tensor_mtxv PREFIXED_NAME(tensor_mtxv)
-#    define tensor_mxm  PREFIXED_NAME(tensor_mxm )
-
-/* y = A x */
-void tensor_mxv(
-  double *restrict y, uint ny,
-  const double *restrict A, const double *restrict x, uint nx)
-{
-  uint i;
-  for(i=0;i<ny;++i) y[i]=0;
-  for(;nx;--nx) {
-    const double xk = *x++;
-    for(i=0;i<ny;++i) y[i] += (*A++)*xk;
-  }
-}
-
-/* y = A^T x */
-void tensor_mtxv(
-  double *restrict y, uint ny,
-  const double *restrict A, const double *restrict x, uint nx)
-{
-  for(;ny;--ny) {
-    const double *restrict xp = x;
-    uint n = nx;
-    double sum = *A++ * *xp++;
-    for(--n;n;--n) sum += *A++ * *xp++;
-    *y++ = sum;
-  }
-}
-
-/* C = A * B */
-void tensor_mxm(
-  double *restrict C, uint nc,
-  const double *restrict A, uint na, const double *restrict B, uint nb)
-{
-  uint i,j,k;
-  for(i=0;i<nc*nb;++i) C[i]=0;
-  for(j=0;j<nb;++j,C+=nc) {
-    const double *restrict A_ = A;
-    for(k=0;k<na;++k) {
-      const double b = *B++;
-      for(i=0;i<nc;++i) C[i] += (*A_++) * b;
-    }
-  }
-}
-
-#  endif
-
-/* C = A^T * B */
-void tensor_mtxm(
-  double *restrict C, uint nc,
-  const double *restrict A, uint na, const double *restrict B, uint nb)
-{
-  uint i,j;
-  for(j=0;j<nb;++j,B+=na) {
-    const double *restrict A_ = A;
-    for(i=0;i<nc;++i,A_+=na) *C++ = tensor_dot(A_,B,na);
-  }
-}
-
-#endif
-
diff --git a/3rdParty/gslib/src/tensor.h b/3rdParty/gslib/src/tensor.h
deleted file mode 100644
index bb65be12c..000000000
--- a/3rdParty/gslib/src/tensor.h
+++ /dev/null
@@ -1,199 +0,0 @@
-#ifndef TENSOR_H
-#define TENSOR_H
-
-#if !defined(TYPES_H) || !defined(NAME_H)
-#warning "tensor.h" requires "types.h" and "name.h"
-#endif
-
-#if defined(USE_CBLAS)
-#  include <cblas.h>
-#  define tensor_dot(a,b,n) cblas_ddot((int)(n),a,1,b,1)
-#  define tensor_mxv(y,ny,A,x,nx) \
-     cblas_dgemv(CblasColMajor,CblasNoTrans,(int)ny,(int)nx, \
-                 1.0,A,(int)ny,x,1,0.0,y,1)
-#  define tensor_mtxv(y,ny,A,x,nx) \
-     cblas_dgemv(CblasColMajor,CblasTrans,(int)nx,(int)ny, \
-                 1.0,A,(int)nx,x,1,0.0,y,1)
-#  define tensor_mxm(C,nc,A,na,B,nb) \
-     cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans, \
-                 (int)nc,(int)nb,(int)na,1.0, \
-                 A,(int)nc,B,(int)na,0.0,C,(int)nc)
-#  define tensor_mtxm(C,nc,A,na,B,nb) \
-     cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans, \
-                 (int)nc,(int)nb,(int)na,1.0, \
-                 A,(int)na,B,(int)na,0.0,C,(int)nc)
-#else
-#  define tensor_dot  PREFIXED_NAME(tensor_dot )
-#  define tensor_mtxm PREFIXED_NAME(tensor_mtxm)
-double tensor_dot(const double *a, const double *b, uint n);
-
-/* C (nc x nb) = [A (na x nc)]^T * B (na x nb); all column-major */
-void tensor_mtxm(double *C, uint nc,
-                 const double *A, uint na, const double *B, uint nb);
-#  if defined(USE_NAIVE_BLAS)
-#    define tensor_mxv  PREFIXED_NAME(tensor_mxv )
-#    define tensor_mtxv PREFIXED_NAME(tensor_mtxv)
-#    define tensor_mxm  PREFIXED_NAME(tensor_mxm )
-/* y = A x */
-void tensor_mxv(double *y, uint ny, const double *A, const double *x, uint nx);
-
-/* y = A^T x */
-void tensor_mtxv(double *y, uint ny, const double *A, const double *x, uint nx);
-
-/* C (nc x nb) = A (nc x na) * B (na x nb); all column-major */
-void tensor_mxm(double *C, uint nc,
-                const double *A, uint na, const double *B, uint nb);
-#  else
-#    define mxm FORTRAN_NAME(mxm,MXM)
-/* C (na x nc) = A (na x nb) * B (nb x nc); all column-major */
-void mxm(const double *A, const uint *na,
-         const double *B, const uint *nb,
-         double *C, const uint *nc);
-/* C (nc x nb) = A (nc x na) * B (na x nb); all column-major */
-static void tensor_mxm(double *C, uint nc,
-                       const double *A, uint na, const double *B, uint nb)
-{ mxm(A,&nc,B,&na,C,&nb); }
-
-/* y = A x */
-static void tensor_mxv(double *y, uint ny,
-                       const double *A, const double *x, uint nx)
-{ uint one=1; mxm(A,&ny,x,&nx,y,&one); }
-
-/* y = A^T x */
-static void tensor_mtxv(double *y, uint ny,
-                        const double *A, const double *x, uint nx)
-{ uint one=1; mxm(x,&one,A,&nx,y,&ny); }
-
-#  endif
-#endif
-
-/*--------------------------------------------------------------------------
-   1-,2-,3-d Tensor Application of Row Vectors (for Interpolation)
-   
-   the 3d case:
-   v = tensor_i3(Jr,nr, Js,ns, Jt,nt, u, work)
-     gives v = [ Jr (x) Js (x) Jt ] u
-     where Jr, Js, Jt are row vectors (interpolation weights)
-     u is nr x ns x nt in column-major format (inner index is r)
-     v is a scalar
-  --------------------------------------------------------------------------*/
-
-static double tensor_i1(const double *Jr, uint nr, const double *u)
-{
-  return tensor_dot(Jr,u,nr);
-}
-
-/* work holds ns doubles */
-static double tensor_i2(const double *Jr, uint nr,
-                        const double *Js, uint ns,
-                        const double *u, double *work)
-{
-  tensor_mtxv(work,ns, u, Jr,nr);
-  return tensor_dot(Js,work,ns);
-}
-
-/* work holds ns*nt + nt doubles */
-static double tensor_i3(const double *Jr, uint nr,
-                        const double *Js, uint ns,
-                        const double *Jt, uint nt,
-                        const double *u, double *work)
-{
-  double *work2 = work+nt;
-  tensor_mtxv(work2,ns*nt,   u,     Jr,nr);
-  tensor_mtxv(work ,nt   ,   work2, Js,ns);
-  return tensor_dot(Jt,work,nt);
-}
-
-/*--------------------------------------------------------------------------
-   1-,2-,3-d Tensor Application of Row Vectors
-             for simultaneous Interpolation and Gradient computation
-   
-   the 3d case:
-   v = tensor_ig3(g, wtr,nr, wts,ns, wtt,nt, u, work)
-     gives v   = [ Jr (x) Js (x) Jt ] u
-           g_0 = [ Dr (x) Js (x) Jt ] u
-           g_1 = [ Jr (x) Ds (x) Jt ] u
-           g_2 = [ Jr (x) Js (x) Dt ] u
-     where Jr,Dr,Js,Ds,Jt,Dt are row vectors,
-       Jr=wtr, Dr=wtr+nr, etc.
-       (interpolation & derivative weights)
-     u is nr x ns x nt in column-major format (inner index is r)
-     v is a scalar, g is an array of 3 doubles
-  --------------------------------------------------------------------------*/
-
-static double tensor_ig1(double g[1],
-                         const double *wtr, uint nr,
-                         const double *u)
-{
-  g[0] = tensor_dot(wtr+nr,u,nr);
-  return tensor_dot(wtr   ,u,nr);
-}
-
-/* work holds 2*nr doubles */
-static double tensor_ig2(double g[2],
-                         const double *wtr, uint nr,
-                         const double *wts, uint ns,
-                         const double *u, double *work)
-{
-  tensor_mxm(work,nr, u,ns, wts,2);
-  g[0] = tensor_dot(wtr+nr,work   ,nr);
-  g[1] = tensor_dot(wtr   ,work+nr,nr);
-  return tensor_dot(wtr   ,work   ,nr);
-}
-
-/* work holds 2*nr*ns + 3*nr doubles */
-static double tensor_ig3(double g[3],
-                         const double *wtr, uint nr,
-                         const double *wts, uint ns,
-                         const double *wtt, uint nt,
-                         const double *u, double *work)
-{
-  const uint nrs = nr*ns;
-  double *a = work, *b = work+2*nrs, *c=b+2*nr;
-  tensor_mxm(a,nrs, u,nt, wtt,2);
-  tensor_mxm(b,nr,  a,ns, wts,2);
-  tensor_mxv(c,nr, a+nrs, wts,ns);
-  g[0] = tensor_dot(b   , wtr+nr, nr);
-  g[1] = tensor_dot(b+nr, wtr   , nr);
-  g[2] = tensor_dot(c   , wtr   , nr);
-  return tensor_dot(b   , wtr   , nr);
-}
-
-/*
-  out - nr x ns
-  u   - mr x ms
-  Jrt - mr x nr, Jst - ms x ns
-  work - nr x ms
-*/
-static void tensor_2t(double *out,
-                      const double *Jrt, uint nr, uint mr,
-                      const double *Jst, uint ns, uint ms,
-                      const double *u, double *work)
-{
-  tensor_mtxm(work,nr, Jrt,mr, u,ms);
-  tensor_mxm(out,nr, work,ms, Jst,ns);
-}
-
-/*
-  out - nr x ns x nt
-  u   - mr x ms x mt
-  Jrt - mr x nr, Jst - ms x ns, Jtt - mt x nt
-  work - nr*ms*mt + nr*ns*mt = nr*(ms+ns)*mt
-*/
-static void tensor_3t(double *out,
-                      const double *Jrt, uint nr, uint mr,
-                      const double *Jst, uint ns, uint ms,
-                      const double *Jtt, uint nt, uint mt,
-                      const double *u, double *work)
-{
-  const uint nrs=nr*ns, mst=ms*mt, nrms=nr*ms;
-  uint k;
-  double *work2 = work+nr*mst;
-  double *p; const double *q;
-  tensor_mtxm(work,nr, Jrt,mr, u,mst);
-  for(k=0,p=work2,q=work;k<mt;++k,p+=nrs,q+=nrms)
-    tensor_mxm(p,nr, q,ms, Jst,ns);
-  tensor_mxm(out,nrs, work2,mt, Jtt,nt);
-}
-
-#endif
diff --git a/3rdParty/gslib/src/types.h b/3rdParty/gslib/src/types.h
deleted file mode 100644
index 14a94bfa4..000000000
--- a/3rdParty/gslib/src/types.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef TYPES_H
-#define TYPES_H
-#include <limits.h>
-
-/* 
-  Define the integer types used throughout the code,
-  controlled by preprocessor macros.
-  
-  The integer type sint/uint (signed/unsigned) is used
-  most frequently, e.g., for indexing into local arrays,
-  and for processor ids. It can be one of
-  
-    macro             sint/uint type
-    
-    (default)         int
-    USE_LONG          long
-    USE_LONG_LONG     long long
-    
-  The slong/ulong type is used in relatively few places
-  for global identifiers and indices. It can be one of
-
-    macro             slong/ulong type
-    
-    (default)         int
-    GLOBAL_LONG       long
-    GLOBAL_LONG_LONG  long long
-
-  Since the long long type is not ISO C90, it is never
-  used unless explicitly asked for.
-
-  The POSIX-standard limits.h header provides the
-  LLONG_MAX and LLONG_MIN macros, which will be
-  preferentially used.  
-
-*/
-
-#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG)
-typedef long long long_long;
-#  define WHEN_LONG_LONG(x) x
-#  if !defined(LLONG_MAX)
-#    if defined(LONG_LONG_MAX)
-#      define LLONG_MAX LONG_LONG_MAX
-#    else
-#      define LLONG_MAX 9223372036854775807
-#    endif
-#  endif
-#  if !defined(LLONG_MIN)
-#    if defined(LONG_LONG_MIN)
-#      define LLONG_MIN LONG_LONG_MIN
-#    else
-#      define LLONG_MIN -9223372036854775807
-#    endif
-#  endif
-#else
-#  define WHEN_LONG_LONG(x)
-#endif
-
-#if !defined(USE_LONG) && !defined(USE_LONG_LONG)
-#  define TYPE_LOCAL(i,l,ll) i
-#elif defined(USE_LONG)
-#  define TYPE_LOCAL(i,l,ll) l
-#elif defined(USE_LONG_LONG)
-#  define TYPE_LOCAL(i,l,ll) ll
-#endif
-
-#if !defined(GLOBAL_LONG) && !defined(GLOBAL_LONG_LONG)
-#  define TYPE_GLOBAL(i,l,ll) i
-#elif defined(GLOBAL_LONG)
-#  define TYPE_GLOBAL(i,l,ll) l
-#else
-#  define TYPE_GLOBAL(i,l,ll) ll
-#endif
-
-/* local integer type: for quantities O(N/P) */
-#define sint   signed TYPE_LOCAL(int,long,long long)
-#define uint unsigned TYPE_LOCAL(int,long,long long)
-#define iabs TYPE_LOCAL(abs,labs,llabs)
-
-/* global integer type: for quantities O(N) */
-#define slong   signed TYPE_GLOBAL(int,long,long long)
-#define ulong unsigned TYPE_GLOBAL(int,long,long long)
-#define iabsl TYPE_GLOBAL(abs,labs,llabs)
-
-#endif
-
diff --git a/3rdParty/gslib/tests/comm_test.c b/3rdParty/gslib/tests/comm_test.c
deleted file mode 100644
index 49b1af0f8..000000000
--- a/3rdParty/gslib/tests/comm_test.c
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "gs_defs.h"
-#include "comm.h"
-
-int main(int narg, char *arg[])
-{
-  comm_ext world; int np;
-  struct comm comm;
-  ulong sum[2],r[2],v, test;
-#ifdef MPI
-  MPI_Init(&narg,&arg);
-  world = MPI_COMM_WORLD;
-  MPI_Comm_size(world,&np);
-#else
-  world=0, np=1;
-#endif
-
-  comm_init(&comm,world);
-  
-  v = comm.id+1;
-  test = comm_reduce_slong(&comm,gs_add,(slong*)&v,1);
-  comm_scan(sum, &comm,gs_slong,gs_add, &v,1, r);
-  printf("%02d: %d %d %d\n",(int)comm.id,(int)sum[0],(int)sum[1],(int)test);
-
-  comm_free(&comm);
-  
-#ifdef MPI
-  MPI_Finalize();
-#endif
-
-  return 0;
-}
diff --git a/3rdParty/gslib/tests/crystal_test.c b/3rdParty/gslib/tests/crystal_test.c
deleted file mode 100644
index c7f50df64..000000000
--- a/3rdParty/gslib/tests/crystal_test.c
+++ /dev/null
@@ -1,88 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "comm.h"
-#include "mem.h"
-#include "crystal.h"
-
-int main(int narg, char *arg[])
-{
-  comm_ext world; int np;
-  struct comm comm;
-  struct crystal cr;
-  uint i,sum, *data, *end;
-#ifdef MPI
-  MPI_Init(&narg,&arg);
-  world = MPI_COMM_WORLD;
-  MPI_Comm_size(world,&np);
-#else
-  world=0, np=1;
-#endif
-
-  comm_init(&comm,world);
-  
-  crystal_init(&cr,&comm);
-
-  cr.data.n = (4+(comm.id&1))*comm.np;
-  buffer_reserve(&cr.data,cr.data.n*sizeof(uint));
-  data = cr.data.ptr;
-  for(i=0;i<comm.np;++i, data+=3+data[2]) {
-    data[0] = i, data[1] = comm.id, data[2] = 1;
-    data[3] = 2*comm.id;
-    if(comm.id&1) data[2] = 2, data[4] = data[3]+1;
-  }
-
-#if 0
-  data = cr.data.ptr, end = data + cr.data.n;
-  for(;data!=end; data+=3+data[2]) {
-    uint i;
-    printf("%u -> %u:",data[1],data[0]);
-    for(i=0;i<data[2];++i) printf(" %u",data[3+i]);
-    printf("\n");
-  }
-#endif
-  
-  crystal_router(&cr);
-
-#if 0
-  printf("\n");
-  data = cr.data.ptr, end = data + cr.data.n;
-  for(;data!=end; data+=3+data[2]) {
-    uint i;
-    printf("%u <- %u:",data[0],data[1]);
-    for(i=0;i<data[2];++i) printf(" %u",data[3+i]);
-    printf("\n");
-  }
-#endif
-  
-  if(cr.data.n != comm.np*4 + (comm.np/2))
-    fail(1,__FILE__,__LINE__,"failure on %u",comm.id);
-  sum = 0;
-  data = cr.data.ptr, end = data + cr.data.n;
-  for(;data!=end; data+=3+data[2]) {
-    sum+=data[1];
-    if(data[3]!=data[1]*2)
-      fail(1,__FILE__,__LINE__,"failure on %u",comm.id);
-    if(data[1]&1 && (data[2]!=2 || data[4]!=data[3]+1))
-      fail(1,__FILE__,__LINE__,"failure on %u",comm.id);
-  }
-  if(sum != comm.np*(comm.np-1)/2)
-    fail(1,__FILE__,__LINE__,"failure on %u",comm.id);
-
-  crystal_free(&cr);
-  comm_free(&comm);
-
-  diagnostic("",__FILE__,__LINE__,
-    "test successful %u/%u",(unsigned)comm.id,(unsigned)comm.np);
-  
-#ifdef MPI
-  MPI_Finalize();
-#endif
-
-  return 0;
-}
diff --git a/3rdParty/gslib/tests/findpts_el_2_test.c b/3rdParty/gslib/tests/findpts_el_2_test.c
deleted file mode 100644
index 7a274aaab..000000000
--- a/3rdParty/gslib/tests/findpts_el_2_test.c
+++ /dev/null
@@ -1,73 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <float.h>
-#include <math.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "types.h"
-#include "fail.h"
-#include "mem.h"
-#include "poly.h"
-#include "findpts_el.h"
-
-#define NR 14
-#define NS 7
-
-static const unsigned nr[3]={NR,NS};
-
-static double elx[NR*NS], ely[NR*NS];
-static const double *const elx2[2] = {elx,ely};
-
-int main()
-{
-  int pass=1;
-  unsigned i,j;
-  double zr[NR], zs[NS];
-  struct findpts_el_data_2 fd;
-  struct findpts_el_pt_2 *pt;
-  findpts_el_setup_2(&fd,nr,NR*NS);
-  pt = findpts_el_points_2(&fd);
-  
-  lobatto_nodes(zr,NR);
-  lobatto_nodes(zs,NS);
-
-  for(j=0;j<NS;++j) for(i=0;i<NR;++i)
-    elx[j*NR+i] = zr[i],
-    ely[j*NR+i] = zs[j];
-
-  findpts_el_start_2(&fd, elx2);
-
-  for(j=0;j<NS;++j) for(i=0;i<NR;++i) {
-    struct findpts_el_pt_2 *p = pt + j*NR+i;
-    p->x[0] = zr[i]*2, p->x[1] = zs[j]*2;
-    p->r[0] = 0, p->r[1] = 0;
-    p->flags = 0;
-  }
-
-  findpts_el_2(&fd, NR*NS, 1024*DBL_EPSILON);
-
-  for(j=0;j<NS;++j) for(i=0;i<NR;++i) {
-    double r,s;
-    struct findpts_el_pt_2 *p = pt + j*NR+i;
-    printf("x = (%g,%g), r = (%g,%g), flags = %x, dist2 = %g\n",
-      p->x[0],p->x[1], p->r[0],p->r[1],
-      p->flags, p->dist2);
-    #define CLAMP(x,r) \
-      do { double temp=r; x = temp<-1?-1:(temp>1?1:temp); } while(0)
-    CLAMP(r,zr[i]*2); CLAMP(s,zs[j]*2);
-    #undef CLAMP
-    if( fabs(r-p->r[0])+fabs(s-p->r[1]) > 1024*DBL_EPSILON )
-      { printf("off by %g\n", fabs(r-p->r[0])+fabs(s-p->r[1]));
-        pass=0; goto fin; }
-  }
-
-fin:
-  
-  findpts_el_free_2(&fd);
-
-  printf("Tests %s\n", pass?"passed":"failed");
-
-  return 0;
-}
diff --git a/3rdParty/gslib/tests/findpts_el_2_test2.c b/3rdParty/gslib/tests/findpts_el_2_test2.c
deleted file mode 100644
index 6942da0c2..000000000
--- a/3rdParty/gslib/tests/findpts_el_2_test2.c
+++ /dev/null
@@ -1,97 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <float.h>
-#include <string.h>
-#include "c99.h"
-#include "types.h"
-#include "name.h"
-#include "fail.h"
-#include "mem.h"
-#include "tensor.h"
-#include "poly.h"
-#include "lob_bnd.h"
-#include "obbox.h"
-#include "findpts_el.h"
-#include "rand_elt_test.h"
-
-#define REPEAT 10000
-
-#define  NR 7
-#define TNR 8
-#define  NS 8
-#define TNS 9
-#define TNTOT (TNR*TNS)
-#define  MR (4*NR)
-#define  MS (4*NS)
-
-static const unsigned nr[2] = {NR,NS};
-
-/* #define NPT 1 */
-#define NPT 256
-/* #define NPT TNR*TNS */
-
-#define TOL 1024*DBL_EPSILON
-
-static double zr[NR], zs[NS];
-static double tzr[TNR], tzs[TNS];
-static double Jr[NR*TNR],Js[NS*TNS];
-static double elx[NR*NS], ely[NR*NS];
-static const double *const elxy[2] = {elx,ely};
-static double telx[2][TNR*TNS];
-static double work[TNR*NS];
-
-int main()
-{
-  int failure=0, unconv=0;
-  unsigned n,i,ie;
-
-  struct findpts_el_data_2 fd;
-  struct findpts_el_pt_2 *pt;
-  findpts_el_setup_2(&fd,nr,NPT);
-  pt = findpts_el_points_2(&fd);
-  
-  lobatto_nodes(tzr,TNR), lobatto_nodes(tzs,TNS);
-  lobatto_nodes(zr,NR), lobatto_nodes(zs,NS);
-
-  for(i=0;i<TNR;++i) fd.lag[0](Jr+i*NR, fd.lag_data[0], NR, 0, tzr[i]);
-  for(i=0;i<TNS;++i) fd.lag[1](Js+i*NS, fd.lag_data[1], NS, 0, tzs[i]);
-  for(n=0;n<REPEAT;++n) {
-    rand_elt_2(elx,ely, zr,NR, zs,NS);
-    tensor_2t(telx[0], Jr,TNR,NR, Js,TNS,NS, elx, work);
-    tensor_2t(telx[1], Jr,TNR,NR, Js,TNS,NS, ely, work);
-    findpts_el_start_2(&fd, elxy);
-    for(i=0;i<TNTOT;) {
-      unsigned i0=i;
-      ie = i+NPT, ie = ie>TNTOT ? TNTOT : ie;
-      for(;i!=ie;++i) {
-        struct findpts_el_pt_2 *p = pt+(i-i0);
-        const double x=telx[0][i],y=telx[1][i];
-        p->x[0]=x,p->x[1]=y;
-        p->flags = 0;
-      }
-      findpts_el_2(&fd, ie-i0, 1024*DBL_EPSILON);
-      for(i=i0;i!=ie;++i) {
-        struct findpts_el_pt_2 *p = pt+(i-i0);
-        const double r=tzr[i%TNR], s=tzs[i/TNR];
-        if((p->flags&(1u<<4))==0) ++unconv;
-        if(fabs(p->r[0]-r)+fabs(p->r[1]-s)>1024*DBL_EPSILON) {
-          printf("found (%g,%g) for (%g,%g) ; error (%g,%g)\n",
-            p->r[0],p->r[1], r,s, p->r[0]-r,p->r[1]-s);
-          printf("(%g,%g) for (%.15g,%.15g) ; dist2 = %g\n",
-            p->x[0],p->x[1],
-            telx[0][i],telx[1][i],p->dist2);
-          ++failure;
-        }
-      }
-    }
-  }
-
-  findpts_el_free_2(&fd);
-
-  printf("%u failed points (out of %u)\n", failure, REPEAT*TNTOT);
-  printf("%u unconverged points\n", unconv);
-
-  return !(failure == 39);
-}
diff --git a/3rdParty/gslib/tests/findpts_el_3_test.c b/3rdParty/gslib/tests/findpts_el_3_test.c
deleted file mode 100644
index 54431c339..000000000
--- a/3rdParty/gslib/tests/findpts_el_3_test.c
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <float.h>
-#include <math.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "types.h"
-#include "fail.h"
-#include "mem.h"
-#include "poly.h"
-#include "findpts_el.h"
-
-#define NR 14
-#define NS 7
-#define NT 25
-
-static const unsigned nr[3]={NR,NS,NT};
-
-static double elx[NR*NS*NT], ely[NR*NS*NT], elz[NR*NS*NT];
-static const double *const elx3[3] = {elx,ely,elz};
-
-int main()
-{
-  int pass=1;
-  unsigned i,j,k;
-  double zr[NR], zs[NS], zt[NT];
-  struct findpts_el_data_3 fd;
-  struct findpts_el_pt_3 *pt;
-  findpts_el_setup_3(&fd,nr,NR*NS*NT);
-  pt = findpts_el_points_3(&fd);
-  
-  lobatto_nodes(zr,NR);
-  lobatto_nodes(zs,NS);
-  lobatto_nodes(zt,NT);
-
-  for(k=0;k<NT;++k) for(j=0;j<NS;++j) for(i=0;i<NR;++i)
-    elx[(k*NS+j)*NR+i] = zr[i],
-    ely[(k*NS+j)*NR+i] = zs[j],
-    elz[(k*NS+j)*NR+i] = zt[k];
-
-  findpts_el_start_3(&fd, elx3);
-
-  for(k=0;k<NT;++k) for(j=0;j<NS;++j) for(i=0;i<NR;++i) {
-    struct findpts_el_pt_3 *p = pt + (k*NS+j)*NR+i;
-    p->x[0] = zr[i]*2, p->x[1] = zs[j]*2, p->x[2] = zt[k]*2;
-    p->r[0] = 0, p->r[1] = 0, p->r[2] = 0;
-    p->flags = 0;
-  }
-
-  findpts_el_3(&fd, NR*NS*NT, 1024*DBL_EPSILON);
-  /* sort_points(pt,NR*NS*NT); */
-
-  for(k=0;k<NT;++k) for(j=0;j<NS;++j) for(i=0;i<NR;++i) {
-    double r,s,t;
-    struct findpts_el_pt_3 *p = pt + (k*NS+j)*NR+i;
-    printf("x = (%g,%g,%g), r = (%g,%g,%g), flags = %x, dist2 = %g\n",
-      p->x[0],p->x[1],p->x[2], p->r[0],p->r[1],p->r[2],
-      p->flags, p->dist2);
-    #define CLAMP(x,r) \
-      do { double temp=r; x = temp<-1?-1:(temp>1?1:temp); } while(0)
-    CLAMP(r,zr[i]*2); CLAMP(s,zs[j]*2); CLAMP(t,zt[k]*2);
-    #undef CLAMP
-    if( fabs(r-p->r[0])+fabs(s-p->r[1])+fabs(t-p->r[2]) > 1024*DBL_EPSILON )
-      { printf("off by %g\n", fabs(r-p->r[0])+fabs(s-p->r[1])+fabs(t-p->r[2]));
-        pass=0; goto fin; }
-  }
-
-fin:
-  
-  findpts_el_free_3(&fd);
-
-  printf("Tests %s\n", pass?"passed":"failed");
-
-  return 0;
-}
diff --git a/3rdParty/gslib/tests/findpts_el_3_test2.c b/3rdParty/gslib/tests/findpts_el_3_test2.c
deleted file mode 100644
index 627b9c660..000000000
--- a/3rdParty/gslib/tests/findpts_el_3_test2.c
+++ /dev/null
@@ -1,107 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <float.h>
-#include <string.h>
-#include "c99.h"
-#include "types.h"
-#include "name.h"
-#include "fail.h"
-#include "mem.h"
-#include "tensor.h"
-#include "poly.h"
-#include "lob_bnd.h"
-#include "obbox.h"
-#include "findpts_el.h"
-#include "rand_elt_test.h"
-
-#define REPEAT 100
-
-#define  NR 7
-#define TNR 8
-#define  NS 8
-#define TNS 9
-#define  NT 9
-#define TNT 7
-#define TNTOT (TNR*TNS*TNT)
-#define  MR (4*NR)
-#define  MS (4*NS)
-#define  MT (4*NT)
-
-static const unsigned nr[3] = {NR,NS,NT};
-
-/* #define NPT 1 */
-#define NPT 256
-/* #define NPT TNR*TNS*TNT */
-
-#define TOL 1024*DBL_EPSILON
-
-static double zr[NR], zs[NS], zt[NT];
-static double tzr[TNR], tzs[TNS], tzt[TNT];
-static double Jr[NR*TNR],Js[NS*TNS],Jt[NT*TNT];
-static double elx[NR*NS*NT], ely[NR*NS*NT], elz[NR*NS*NT];
-static const double *const elxyz[3] = {elx,ely,elz};
-static double telx[3][TNR*TNS*TNT];
-static double work[TNR*(NS+TNS)*NT];
-
-int main()
-{
-  int failure=0;
-  unsigned n,i,ie;
-
-  int unconv=0;
-
-  struct findpts_el_data_3 fd;
-  struct findpts_el_pt_3 *pt;
-  findpts_el_setup_3(&fd,nr,NPT);
-  pt = findpts_el_points_3(&fd);
-  
-  lobatto_nodes(tzr,TNR), lobatto_nodes(tzs,TNS), lobatto_nodes(tzt,TNT);
-  lobatto_nodes(zr,NR), lobatto_nodes(zs,NS), lobatto_nodes(zt,NT);
-
-  for(i=0;i<TNR;++i) fd.lag[0](Jr+i*NR, fd.lag_data[0], NR, 0, tzr[i]);
-  for(i=0;i<TNS;++i) fd.lag[1](Js+i*NS, fd.lag_data[1], NS, 0, tzs[i]);
-  for(i=0;i<TNT;++i) fd.lag[2](Jt+i*NT, fd.lag_data[2], NT, 0, tzt[i]);
-  for(n=0;n<6+REPEAT;++n) {
-    if(n<6)
-      bubble_elt(elx,ely,elz, zr,NR, zs,NS, zt,NT, n);
-    else
-      rand_elt_3(elx,ely,elz, zr,NR, zs,NS, zt,NT);
-    tensor_3t(telx[0], Jr,TNR,NR, Js,TNS,NS, Jt,TNT,NT, elx, work);
-    tensor_3t(telx[1], Jr,TNR,NR, Js,TNS,NS, Jt,TNT,NT, ely, work);
-    tensor_3t(telx[2], Jr,TNR,NR, Js,TNS,NS, Jt,TNT,NT, elz, work);
-    findpts_el_start_3(&fd, elxyz);
-    for(i=0;i<TNTOT;) {
-      unsigned i0=i;
-      ie = i+NPT, ie = ie>TNTOT ? TNTOT : ie;
-      for(;i!=ie;++i) {
-        struct findpts_el_pt_3 *p = pt+(i-i0);
-        const double x=telx[0][i],y=telx[1][i],z=telx[2][i];
-        p->x[0]=x,p->x[1]=y,p->x[2]=z;
-        p->flags = 0;
-      }
-      findpts_el_3(&fd, ie-i0, 1024*DBL_EPSILON);
-      for(i=i0;i!=ie;++i) {
-        struct findpts_el_pt_3 *p = pt+(i-i0);
-        const double r=tzr[i%TNR], s=tzs[(i/TNR)%TNS], t=tzt[i/(TNR*TNS)];
-        if((p->flags&(1u<<6))==0) ++unconv;
-        if(fabs(p->r[0]-r)+fabs(p->r[1]-s)+fabs(p->r[2]-t)>1024*DBL_EPSILON) {
-          printf("found (%g,%g,%g) for (%g,%g,%g) ; error (%g,%g,%g)\n",
-            p->r[0],p->r[1],p->r[2], r,s,t, p->r[0]-r,p->r[1]-s,p->r[2]-t);
-          printf("(%g,%g,%g) for (%.15g,%.15g,%.15g) ; dist2 = %g\n",
-            p->x[0],p->x[1],p->x[2],
-            telx[0][i],telx[1][i],telx[2][i],p->dist2);
-          ++failure;
-        }
-      }
-    }
-  }
-
-  findpts_el_free_3(&fd);
-
-  printf("%u failed points (out of %u)\n", failure, (6+REPEAT)*TNTOT);
-  printf("%u unconverged points\n", unconv);
-
-  return 0;
-}
diff --git a/3rdParty/gslib/tests/findpts_local_test.c b/3rdParty/gslib/tests/findpts_local_test.c
deleted file mode 100644
index 0ebe144df..000000000
--- a/3rdParty/gslib/tests/findpts_local_test.c
+++ /dev/null
@@ -1,210 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <float.h>
-#include <math.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "mem.h"
-#include "types.h"
-#include "poly.h"
-#include "obbox.h"
-#include "findpts_el.h"
-#include "findpts_local.h"
-#include "rand_elt_test.h"
-
-#define D 3
-
-#if D==3
-#define INITD(a,b,c) {a,b,c}
-#define MULD(a,b,c) ((a)*(b)*(c))
-#define INDEXD(a,na, b,nb, c) (((c)*(nb)+(b))*(na)+(a))
-#define findpts_local_data  findpts_local_data_3
-#define findpts_local_setup findpts_local_setup_3
-#define findpts_local_free  findpts_local_free_3
-#define findpts_local       findpts_local_3
-#elif D==2
-#define INITD(a,b,c) {a,b}
-#define MULD(a,b,c) ((a)*(b))
-#define INDEXD(a,na, b,nb, c) ((b)*(na)+(a))
-#define findpts_local_data  findpts_local_data_2
-#define findpts_local_setup findpts_local_setup_2
-#define findpts_local_free  findpts_local_free_2
-#define findpts_local       findpts_local_2
-#endif
-
-#define NR 5
-#define NS 8
-#define NT 6
-#define K 4
-#define NEL MULD(K,K,K)
-#define TN 4
-
-#define NPT_MAX 256
-#define BBOX_TOL 0.01
-#define NEWT_TOL 1024*DBL_EPSILON
-#define MAX_HASH_SIZE NEL*MULD(NR,NS,NT)
-
-/*
-#define NPT_MAX 256
-#define BBOX_TOL 1.00
-#define NEWT_TOL 1024*DBL_EPSILON
-#define MAX_HASH_SIZE NEL*3
-*/
-
-static const unsigned nr[D] = INITD(NR,NS,NT);
-static const unsigned mr[D] = INITD(4*NR,4*NS,4*NT);
-static double zr[NR], zs[NS], zt[NT];
-static double x3[D][MULD(3,3,3)];
-static double mesh[D][NEL*MULD(NR,NS,NT)];
-static const double *const elx[D] = INITD(mesh[0],mesh[1],mesh[2]);
-
-static double testx[NEL*MULD(TN,TN,TN)*D];
-struct pt_data { double r[D], dist2; uint code, el; };
-static struct pt_data testp[NEL*MULD(TN,TN,TN)];
-
-static double quad_eval(const double coef[MULD(3,3,3)], const double r[D])
-{
-  double lr0[D], lr1[D], lr2[D];
-  unsigned d;
-  for(d=0;d<D;++d) lr0[d]=r[d]*(r[d]-1)/2,
-                   lr1[d]=(1+r[d])*(1-r[d]),
-                   lr2[d]=r[d]*(r[d]+1)/2;
-  #define EVALR(base) ( coef [base   ]*lr0[0] \
-                       +coef [base+ 1]*lr1[0] \
-                       +coef [base+ 2]*lr2[0] )
-  #define EVALS(base) ( EVALR(base   )*lr0[1] \
-                       +EVALR(base+ 3)*lr1[1] \
-                       +EVALR(base+ 6)*lr2[1] )
-  #define EVALT(base) ( EVALS(base   )*lr0[2] \
-                       +EVALS(base+ 9)*lr1[2] \
-                       +EVALS(base+18)*lr2[2] )
-  #if D==2
-  #  define EVAL() EVALS(0)
-  #elif D==3
-  #  define EVAL() EVALT(0)
-  #endif
-  
-  return EVAL();
-  
-  #undef EVAL
-  #undef EVALT
-  #undef EVALS
-  #undef EVALR
-}
-
-static void rand_mesh(void)
-{
-  const double fac = 1.0/K;
-  const double z3[3] = {-1,0,1};
-  unsigned ki,kj;
-  #if D==3
-  unsigned kk;
-  rand_elt_3(x3[0],x3[1],x3[2], z3,3,z3,3,z3,3);
-  #elif D==2
-  rand_elt_2(x3[0],x3[1],       z3,3,z3,3);
-  #endif
-  #if D==3
-  for(kk=0;kk<K;++kk)
-  #endif
-  for(kj=0;kj<K;++kj) for(ki=0;ki<K;++ki) {
-    unsigned off = INDEXD(ki,K, kj,K, kk)*MULD(NR,NS,NT);
-    unsigned i,j;
-    double r[D], base[D] = INITD(-1+2*fac*ki,-1+2*fac*kj,-1+2*fac*kk);
-    #if D==3
-    unsigned k;
-    for(k=0;k<NT;++k) { r[2] = base[2]+fac*(1+zt[k]);
-    #endif
-    for(j=0;j<NS;++j) { r[1] = base[1]+fac*(1+zs[j]);
-    for(i=0;i<NR;++i) { r[0] = base[0]+fac*(1+zr[i]);
-      mesh[0][off+INDEXD(i,NR, j,NS, k)] = quad_eval(x3[0],r);
-      mesh[1][off+INDEXD(i,NR, j,NS, k)] = quad_eval(x3[1],r);
-      #if D==3
-      mesh[2][off+INDEXD(i,NR, j,NS, k)] = quad_eval(x3[2],r);
-    }
-    #endif
-    }}
-  }
-}
-
-static void test_mesh(void)
-{
-  const double fac = 1.0/K, step = 2.0/(K*(TN-1));
-  unsigned ki,kj;
-  #if D==3
-  unsigned kk;
-  for(kk=0;kk<K;++kk)
-  #endif
-  for(kj=0;kj<K;++kj) for(ki=0;ki<K;++ki) {
-    unsigned off = INDEXD(ki,K, kj,K, kk)*MULD(TN,TN,TN);
-    unsigned i,j;
-    double r[D], base[D] = INITD(-1+2*fac*ki,-1+2*fac*kj,-1+2*fac*kk);
-    #if D==3
-    unsigned k;
-    for(k=0;k<TN;++k) { r[2] = base[2]+step*k;
-    #endif
-    for(j=0;j<TN;++j) { r[1] = base[1]+step*j;
-    for(i=0;i<TN;++i) { r[0] = base[0]+step*i;
-      testx[(off+INDEXD(i,TN, j,TN, k))*D+0] = quad_eval(x3[0],r);
-      testx[(off+INDEXD(i,TN, j,TN, k))*D+1] = quad_eval(x3[1],r);
-      #if D==3
-      testx[(off+INDEXD(i,TN, j,TN, k))*D+2] = quad_eval(x3[2],r);
-    }
-    #endif
-    }}
-  }
-}
-
-static void print_ptdata(void)
-{
-  uint i,notfound=0;
-  double dist2max=0;
-  for(i=0;i<NEL*MULD(TN,TN,TN);++i) {
-    printf("code=%u, el=%u, dist2=%g, r=(%.17g,%.17g"
-           #if D==3
-           ",%.17g"
-           #endif
-           ")\n",
-      testp[i].code,testp[i].el,testp[i].dist2,
-      testp[i].r[0],testp[i].r[1]
-      #if D==3
-      ,testp[i].r[2]
-      #endif
-      );
-    dist2max=testp[i].dist2>dist2max?testp[i].dist2:dist2max;
-    if(testp[i].code==2) ++notfound;
-  }
-  printf("Maximum distance = %g\n%u points not found\n",
-         sqrt(dist2max), (unsigned)notfound);
-}
-
-static void test(buffer *buf)
-{
-  const double *const x_base[D]=INITD(testx,testx+1,testx+2);
-  const unsigned x_stride[D]=
-    INITD(D*sizeof(double),D*sizeof(double),D*sizeof(double));
-  struct findpts_local_data fld;
-  rand_mesh();
-  test_mesh();
-  findpts_local_setup(&fld,elx,nr,NEL,mr,BBOX_TOL,MAX_HASH_SIZE,
-                      NPT_MAX,NEWT_TOL);
-  findpts_local(&testp[0].code , sizeof(struct pt_data),
-                &testp[0].el   , sizeof(struct pt_data),
-                 testp[0].r    , sizeof(struct pt_data),
-                &testp[0].dist2, sizeof(struct pt_data),
-                x_base, x_stride,
-                NEL*MULD(TN,TN,TN), &fld, buf);
-  findpts_local_free(&fld);
-  print_ptdata();
-}
-
-int main()
-{
-  buffer buf = null_buffer;
-  lobatto_nodes(zr,NR),lobatto_nodes(zs,NS),lobatto_nodes(zt,NT);
-  test(&buf);
-  buffer_free(&buf);
-  return 0;
-}
diff --git a/3rdParty/gslib/tests/findpts_test.c b/3rdParty/gslib/tests/findpts_test.c
deleted file mode 100644
index ad9638228..000000000
--- a/3rdParty/gslib/tests/findpts_test.c
+++ /dev/null
@@ -1,328 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <float.h>
-#include <math.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "mem.h"
-#include "poly.h"
-#include "gs_defs.h"
-#include "comm.h"
-#include "rand_elt_test.h"
-#include "findpts.h"
-#include "crystal.h"
-#include "sarray_transfer.h"
-
-#define D 3
-
-#if D==3
-#define INITD(a,b,c) {a,b,c}
-#define MULD(a,b,c) ((a)*(b)*(c))
-#define INDEXD(a,na, b,nb, c) (((c)*(nb)+(b))*(na)+(a))
-#define findpts_data  findpts_data_3
-#define findpts_setup findpts_setup_3
-#define findpts_free  findpts_free_3
-#define findpts       findpts_3
-#define findpts_eval  findpts_eval_3
-#elif D==2
-#define INITD(a,b,c) {a,b}
-#define MULD(a,b,c) ((a)*(b))
-#define INDEXD(a,na, b,nb, c) ((b)*(na)+(a))
-#define findpts_data  findpts_data_2
-#define findpts_setup findpts_setup_2
-#define findpts_free  findpts_free_2
-#define findpts       findpts_2
-#define findpts_eval  findpts_eval_2
-#endif
-
-#define NR 5
-#define NS 7
-#define NT 6
-#define K 4
-#define NEL MULD(K,K,K)
-#define TN 4
-
-#define NPT_MAX 256
-#define BBOX_TOL 0.01
-#define NEWT_TOL 1024*DBL_EPSILON
-#define LOC_HASH_SIZE NEL*MULD(NR,NS,NT)
-#define GBL_HASH_SIZE NEL*MULD(NR,NS,NT)
-
-/*
-#define NPT_MAX 256
-#define BBOX_TOL 1.00
-#define NEWT_TOL 1024*DBL_EPSILON
-#define LOC_HASH_SIZE NEL*3
-#define GBL_HASH_SIZE NEL*3
-*/
-
-static uint np, id;
-
-static const unsigned nr[D] = INITD(NR,NS,NT);
-static const unsigned mr[D] = INITD(2*NR,2*NS,2*NT);
-static double zr[NR], zs[NS], zt[NT];
-static double x3[D][MULD(3,3,3)];
-static double mesh[D][NEL*MULD(NR,NS,NT)];
-static const double *const elx[D] = INITD(mesh[0],mesh[1],mesh[2]);
-
-struct pt_data { double x[D], r[D], dist2, ex[D]; uint code, proc, el; };
-static struct array testp;
-
-static struct crystal cr;
-
-static double quad_eval(const double coef[MULD(3,3,3)], const double r[D])
-{
-  double lr0[D], lr1[D], lr2[D];
-  unsigned d;
-  for(d=0;d<D;++d) lr0[d]=r[d]*(r[d]-1)/2,
-                   lr1[d]=(1+r[d])*(1-r[d]),
-                   lr2[d]=r[d]*(r[d]+1)/2;
-  #define EVALR(base) ( coef [base   ]*lr0[0] \
-                       +coef [base+ 1]*lr1[0] \
-                       +coef [base+ 2]*lr2[0] )
-  #define EVALS(base) ( EVALR(base   )*lr0[1] \
-                       +EVALR(base+ 3)*lr1[1] \
-                       +EVALR(base+ 6)*lr2[1] )
-  #define EVALT(base) ( EVALS(base   )*lr0[2] \
-                       +EVALS(base+ 9)*lr1[2] \
-                       +EVALS(base+18)*lr2[2] )
-  #if D==2
-  #  define EVAL() EVALS(0)
-  #elif D==3
-  #  define EVAL() EVALT(0)
-  #endif
-  
-  return EVAL();
-  
-  #undef EVAL
-  #undef EVALT
-  #undef EVALS
-  #undef EVALR
-}
-
-static void rand_mesh(void)
-{
-  const uint pn = ceil(pow(np,1.0/D));
-  const uint pi=id%pn, pj=(id/pn)%pn;
-  #if D==3
-  const uint pk=(id/pn)/pn;
-  #endif
-  const double pfac = 1.0/pn;
-  const double pbase[D] = INITD(-1+2*pfac*pi, -1+2*pfac*pj, -1+2*pfac*pk);
-  const double fac = 1.0/K;
-  const double z3[3] = {-1,0,1};
-  unsigned ki,kj;
-  #if D==3
-  unsigned kk;
-  rand_elt_3(x3[0],x3[1],x3[2], z3,3,z3,3,z3,3);
-  #elif D==2
-  rand_elt_2(x3[0],x3[1],       z3,3,z3,3);
-  #endif
-  if(id==0) printf("Global division: %u^%d\n",(unsigned)pn,D);
-  #if D==3
-  for(kk=0;kk<K;++kk)
-  #endif
-  for(kj=0;kj<K;++kj) for(ki=0;ki<K;++ki) {
-    unsigned off = INDEXD(ki,K, kj,K, kk)*MULD(NR,NS,NT);
-    unsigned i,j;
-    double r[D], base[D] = INITD(-1+2*fac*ki,-1+2*fac*kj,-1+2*fac*kk);
-    #if D==3
-    unsigned k;
-    for(k=0;k<NT;++k) { r[2]=pbase[2]+pfac*(1+base[2]+fac*(1+zt[k]));
-    #endif
-    for(j=0;j<NS;++j) { r[1]=pbase[1]+pfac*(1+base[1]+fac*(1+zs[j]));
-    for(i=0;i<NR;++i) { r[0]=pbase[0]+pfac*(1+base[0]+fac*(1+zr[i]));
-      mesh[0][off+INDEXD(i,NR, j,NS, k)] = quad_eval(x3[0],r);
-      mesh[1][off+INDEXD(i,NR, j,NS, k)] = quad_eval(x3[1],r);
-      #if D==3
-      mesh[2][off+INDEXD(i,NR, j,NS, k)] = quad_eval(x3[2],r);
-    }
-    #endif
-    }}
-  }
-}
-
-static void test_mesh(void)
-{
-  const uint pn = ceil(pow(np,1.0/D));
-  const uint pi=id%pn, pj=(id/pn)%pn;
-  #if D==3
-  const uint pk=(id/pn)/pn;
-  #endif
-  const double pfac = 1.0/pn;
-  const double pbase[D] = INITD(-1+2*pfac*pi, -1+2*pfac*pj, -1+2*pfac*pk);
-  const double fac = 1.0/K, step = 2.0/(K*(TN-1));
-  unsigned ki,kj;
-  #if D==3
-  unsigned kk;
-  #endif
-  struct pt_data *out = testp.ptr;
-  testp.n = NEL*MULD(TN,TN,TN);
-  memset(testp.ptr,0,testp.n*sizeof(struct pt_data));
-  #if D==3
-  for(kk=0;kk<K;++kk)
-  #endif
-  for(kj=0;kj<K;++kj) for(ki=0;ki<K;++ki) {
-    unsigned i,j;
-    double r[D], base[D] = INITD(-1+2*fac*ki,-1+2*fac*kj,-1+2*fac*kk);
-    #if D==3
-    unsigned k;
-    for(k=0;k<TN;++k) { r[2] = pbase[2]+pfac*(1+base[2]+step*k);
-    #endif
-    for(j=0;j<TN;++j) { r[1] = pbase[1]+pfac*(1+base[1]+step*j);
-    for(i=0;i<TN;++i) { r[0] = pbase[0]+pfac*(1+base[0]+step*i);
-      out->proc = rand()%np;
-      out->x[0] = quad_eval(x3[0],r);
-      out->x[1] = quad_eval(x3[1],r);
-      #if D==3
-      out->x[2] = quad_eval(x3[2],r);
-      #endif
-      ++out;
-    }}
-    #if D==3
-    }
-    #endif
-  }
-  sarray_transfer(struct pt_data,&testp,proc,1,&cr);
-  if(0)
-    printf("%u: %u shuffled points\n",id,(unsigned)testp.n);
-}
-
-static void print_ptdata(const struct comm *const comm)
-{
-  uint notfound=0;
-  double dist2max=0, ed2max=0;
-  const struct pt_data *pt = testp.ptr, *const end = pt+testp.n;
-  for(;pt!=end;++pt) {
-    if(0&&id==0)
-    printf("code=%u, proc=%u, el=%u, dist2=%g, r=(%.17g,%.17g"
-           #if D==3
-           ",%.17g"
-           #endif
-           "), "
-           "x=(%.17g,%.17g"
-           #if D==3
-           ",%.17g"
-           #endif
-           "), ex=(%.17g,%.17g"
-           #if D==3
-           ",%.17g"
-           #endif
-           ")\n",
-      pt->code,pt->proc,pt->el,pt->dist2,
-      pt->r[0],pt->r[1],
-      #if D==3
-      pt->r[2],
-      #endif
-      pt->x[0],pt->x[1],
-      #if D==3
-      pt->x[2],
-      #endif
-      pt->ex[0],pt->ex[1]
-      #if D==3
-      ,pt->ex[2]
-      #endif
-      );
-    if(pt->code==2) ++notfound;
-    else {
-      double ed2=0, dx;
-      unsigned d; for(d=0;d<D;++d) dx=pt->x[d]-pt->ex[d], ed2+=dx*dx;
-      dist2max=pt->dist2>dist2max?pt->dist2:dist2max;
-      ed2max=ed2>ed2max?ed2:ed2max;
-    }
-  }
-  {
-    double distmax=sqrt(dist2max), edmax=sqrt(ed2max);
-    slong total=testp.n;
-    if(0)
-    printf("%u: maximum distance = %g (adv), %g (eval);"
-           " %u/%u points not found\n",
-         (unsigned)id, distmax, edmax,
-         (unsigned)notfound, (unsigned)testp.n);
-    distmax = comm_reduce_double(comm,gs_max,&distmax,1);
-    edmax   = comm_reduce_double(comm,gs_max,&edmax  ,1);
-    notfound = comm_reduce_sint(comm,gs_add,(sint*)&notfound,1);
-    total    = comm_reduce_slong(comm,gs_add,&total,1);
-    if(id==0)
-      printf("maximum distance = %g (adv), %g (eval);"
-           " %u/%lu points not found\n",
-           distmax, edmax,
-           (unsigned)notfound, (unsigned long)total);
-  }
-}
-
-static void test(const struct comm *const comm)
-{
-  const double *x_base[D];
-  const unsigned x_stride[D] = INITD(sizeof(struct pt_data),
-                                     sizeof(struct pt_data),
-                                     sizeof(struct pt_data));
-  struct findpts_data *fd;
-  struct pt_data *pt;
-  unsigned d;
-  if(id==0) printf("Initializing mesh\n");
-  rand_mesh();
-  test_mesh();
-  pt = testp.ptr;
-  if(id==0) printf("calling findpts_setup\n");
-  fd=findpts_setup(comm,elx,nr,NEL,mr,BBOX_TOL,
-                   LOC_HASH_SIZE,GBL_HASH_SIZE,
-                   NPT_MAX,NEWT_TOL);
-  if(id==0) printf("calling findpts\n");
-  x_base[0]=pt->x, x_base[1]=pt->x+1;
-  #if D==3
-  x_base[2]=pt->x+2;
-  #endif
-  findpts(&pt->code , sizeof(struct pt_data),
-          &pt->proc , sizeof(struct pt_data),
-          &pt->el   , sizeof(struct pt_data),
-           pt->r    , sizeof(struct pt_data),
-          &pt->dist2, sizeof(struct pt_data),
-           x_base   , x_stride, testp.n, fd);
-  for(d=0;d<D;++d) {
-    if(id==0) printf("calling findpts_eval (%u)\n",d);
-    findpts_eval(&pt->ex[d], sizeof(struct pt_data),
-                 &pt->code , sizeof(struct pt_data),
-                 &pt->proc , sizeof(struct pt_data),
-                 &pt->el   , sizeof(struct pt_data),
-                  pt->r    , sizeof(struct pt_data),
-                  testp.n, mesh[d], fd);
-  }
-  findpts_free(fd);
-  print_ptdata(comm);
-}
-
-int main(int narg, char *arg[])
-{
-  comm_ext world;
-  struct comm comm;
-  
-#ifdef MPI
-  MPI_Init(&narg,&arg);
-  world = MPI_COMM_WORLD;
-#else
-  world=0;
-#endif
-
-  comm_init(&comm,world);
-  id=comm.id, np=comm.np;
-
-  lobatto_nodes(zr,NR),lobatto_nodes(zs,NS),lobatto_nodes(zt,NT);
-  array_init(struct pt_data,&testp,NEL*MULD(TN,TN,TN));
-  crystal_init(&cr,&comm);
-  test(&comm);
-  crystal_free(&cr);
-  array_free(&testp);
-  
-  comm_free(&comm);
-
-#ifdef MPI
-  MPI_Finalize();
-#endif
-
-  return 0;
-}
diff --git a/3rdParty/gslib/tests/fortran/f-igs.f b/3rdParty/gslib/tests/fortran/f-igs.f
deleted file mode 100644
index 6cff51374..000000000
--- a/3rdParty/gslib/tests/fortran/f-igs.f
+++ /dev/null
@@ -1,59 +0,0 @@
-      program figs
-      implicit none
-
-      include 'mpif.h'
-
-      integer npmax
-      parameter(npmax=16)
-
-      integer ierror,handle,hwait,np,me,i,neighbors,count
-      integer*8 id(npmax)
-
-      real*8 answer(npmax),u(npmax)
-
-      call mpi_init(ierror)
-      call mpi_comm_size(mpi_comm_world,np,ierror)
-      call mpi_comm_rank(mpi_comm_world,me,ierror)
-
-      count=1
-      if(me.gt.0) then
-        id(count)=me
-        count=count+1
-      endif
-      id(count)=me+1
-      count=count+1
-      if(me.lt.(np-1)) then
-        id(count)=me+2
-        count=count+1
-      endif
-
-      neighbors=count-1
-!     gs_pairwise
-      call gs_setup_pick(handle,id,neighbors,mpi_comm_world,np,1)
-
-      if(np.eq.1) then
-        answer(1)=1.0
-      else
-        answer(1)=2.0
-        answer(np)=2.0
-        do i=2,np-1
-          answer(i)=3.0
-        enddo
-      endif
-
-      do i=1,neighbors
-        u(i)=1.0
-      enddo
-
-      call igs_op(handle,u,1,1,0,hwait)
-      call gs_op_wait(hwait)
-
-      do i=1,neighbors
-        if(abs(u(i)-answer(id(i)))>1e-16) then
-          write(6,*) 'igs_op test failed'
-        endif
-      enddo
-
-      call mpi_finalize(ierror)
-
-      end
diff --git a/3rdParty/gslib/tests/gs_test.c b/3rdParty/gslib/tests/gs_test.c
deleted file mode 100644
index 1d0e948c7..000000000
--- a/3rdParty/gslib/tests/gs_test.c
+++ /dev/null
@@ -1,133 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <math.h>
-
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "comm.h"
-#include "mem.h"
-#include "gs_defs.h"
-#include "gs.h"
-
-typedef double T;
-const gs_dom dom = gs_double;
-
-static void test(const struct comm *comm, gs_method method)
-{
-  struct gs_data *gsh;
-  const uint np = comm->np;
-  slong *id = tmalloc(slong,np+4);
-  T *v = tmalloc(T,np+4);
-  uint i;
-  id[0] = -(slong)(np+10+3*comm->id);
-  for(i=0;i<np;++i) id[i+1] = -(sint)(i+1);
-  id[np+1] = comm->id+1;
-  id[np+2] = comm->id+1;
-  id[np+3] = np-comm->id;
-  gsh = gs_setup(id,np+4,comm,0,method,1);
-  free(id);
-
-  /* non-blocking api - original test */
-  if(comm->id==0) printf("\nTesting non-blocking api ...\n");
-  for(i=0;i<np+4;++i) v[i] = 1;
-  int handle;
-  igs(v,dom,gs_add,0,gsh,0,&handle);
-  gs_wait (handle);
-  if(comm->id==0) for(i=0;i<np+4;++i) printf("%g\n",v[i]);
-  if(comm->id==0) printf("\n");
-
-  for(i=0;i<np+4;++i) v[i] = 1;
-  igs(v,dom,gs_add,1,gsh,0,&handle);
-  gs_wait (handle);
-  if(comm->id==0) for(i=0;i<np+4;++i) printf("%g\n",v[i]);
-
-  /* blocking api - original test */
-  if(comm->id==0) printf("\nTesting blocking api ...\n");
-  for(i=0;i<np+4;++i) v[i] = 1;
-  gs(v,dom,gs_add,0,gsh,0);
-  if(comm->id==0) for(i=0;i<np+4;++i) printf("%g\n",v[i]);
-  if(comm->id==0) printf("\n");
-
-  for(i=0;i<np+4;++i) v[i] = 1;
-  gs(v,dom,gs_add,1,gsh,0);
-  if(comm->id==0) for(i=0;i<np+4;++i) printf("%g\n",v[i]);
-  gs_free(gsh);
-  free(v);
-
-  /* non-blocking gs, gs_many and gs_vec */
-  uint neighbors = 3; //max neighbors, counting ownself
-  slong id1[neighbors];
-  slong me = (slong)comm->id;
-  uint count=0;
-  if(me>0) id1[count++]=me;
-  id1[count++]=me+1;
-  if(me<np-1) id1[count++]=me+2;
-  neighbors=count;
-
-  struct gs_data *gsh1;
-  gsh1 = gs_setup(id1,neighbors,comm,0,method,0);
-
-  T answer[np];
-  if   (np==1) answer[0]=1.0;
-  else {
-    answer[0]=answer[np-1]=2.0;
-    for(i=1;i<np-1;i++) answer[i]=3.0;
-  }
-
-  T u[neighbors];
-  for(i=0;i<neighbors;i++) u[i]=1.0;
-
-  igs(u,dom,gs_add,0,gsh,0,&handle);
-  gs_wait(handle);
-
-  for(i=0;i<neighbors;i++) assert(fabs(u[i]-answer[id1[i]-1])<1e-16);
-
-  T w[neighbors][2];
-  for(i=0;i<neighbors;i++) w[i][0]=1.0,w[i][1]=2.0;
-  igs_vec(w,2,dom,gs_add,0,gsh,0,&handle);
-  gs_wait(handle);
-
-  for(i=0;i<neighbors;i++) assert(fabs(w[i][0]-answer[id1[i]-1])<1e-16);
-  for(i=0;i<neighbors;i++) assert(fabs(w[i][1]-2*answer[id1[i]-1])<1e-16);
-
-  T x1[neighbors], x2[neighbors];
-  for(i=0;i<neighbors;i++) x1[i]=1.0,x2[i]=2.0;
-  T *x[2] = {x1, x2};
-  igs_many((void*)x,2,dom,gs_add,0,gsh,0,&handle);
-  gs_wait(handle);
-
-  for(i=0;i<neighbors;i++) assert(fabs(x1[i]-answer[id1[i]-1])<1e-16);
-  for(i=0;i<neighbors;i++) assert(fabs(x2[i]-2*answer[id1[i]-1])<1e-16);
-}
-
-int main(int narg, char *arg[])
-{
-  comm_ext world; int np;
-  struct comm comm;
-
-#ifdef MPI
-  MPI_Init(&narg,&arg);
-  world = MPI_COMM_WORLD;
-  MPI_Comm_size(world,&np);
-#else
-  world=0, np=1;
-#endif
-
-  comm_init(&comm,world);
-
-  test(&comm,gs_all_reduce);
-  test(&comm,gs_pairwise);
-
-  comm_free(&comm);
-
-#ifdef MPI
-  MPI_Finalize();
-#endif
-
-  return 0;
-}
diff --git a/3rdParty/gslib/tests/gs_test_gop_blocking.c b/3rdParty/gslib/tests/gs_test_gop_blocking.c
deleted file mode 100644
index 237ac7f57..000000000
--- a/3rdParty/gslib/tests/gs_test_gop_blocking.c
+++ /dev/null
@@ -1,107 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "comm.h"
-#include "mem.h"
-#include "gs_defs.h"
-#include "gs.h"
-
-struct gs_data *gop_handle;
-int np;
-
-//------------------------------------------------------------------------------
-void gop_init(struct comm *gop_comm, comm_ext world) {
-  comm_init(gop_comm, world);
-
-  const long long gop_id = 1;
-
-  gop_handle = gs_setup(&gop_id, 1, gop_comm, 0, gs_auto, 0);
-}
-//------------------------------------------------------------------------------
-void gop(void *u, gs_dom dom, gs_op op, unsigned transpose) {
-  gs(u, dom, op, transpose, gop_handle, NULL);
-}
-//------------------------------------------------------------------------------
-void gop_free(struct comm* gop_comm) {
-  comm_free(gop_comm);
-
-  gs_free(gop_handle);
-}
-//------------------------------------------------------------------------------
-int test_min(int rank) {
-  int min = rank;
-  gop(&min, gs_int, gs_min, 0);
-
-  if (rank == 0) printf("\ngop min test: ");
-  if (min == 0) {
-    if (rank == 0) printf("[Passed]");
-    return 0;
-  } else {
-    if (rank == 0) printf("[Failed]");
-    return 1;
-  }
-}
-//------------------------------------------------------------------------------
-int test_max(int rank) {
-  int max = rank;
-  gop(&max, gs_int, gs_max, 0);
-
-  if (rank == 0) printf("\ngop max test: ");
-  if (max == np-1) {
-    if (rank == 0) printf("[Passed]");
-    return 0;
-  } else {
-    if (rank == 0) printf("[Failed]");
-    return 1;
-  }
-}
-//------------------------------------------------------------------------------
-int test_add(int rank) {
-  int sum = rank;
-  gop(&sum, gs_int, gs_add, 0);
-  sum *= 2;
-
-  if (rank == 0) printf("\ngop add test: ");
-  if (sum == np*(np-1)) {
-    if (rank == 0) printf("[Passed]");
-    return 0;
-  } else {
-    if (rank == 0) printf("[Failed]");
-    return 1;
-  }
-}
-//------------------------------------------------------------------------------
-int main(int narg, char *arg[])
-{
-  comm_ext world; int rank, result;
-  struct comm comm;
-
-#ifdef MPI
-  MPI_Init(&narg,&arg);
-  world = MPI_COMM_WORLD;
-  MPI_Comm_size(world,&np);
-  MPI_Comm_rank(world,&rank);
-#else
-  world=0, np=1; rank = 0;
-#endif
-
-  gop_init(&comm,world);
-
-  result  = test_min(rank);
-  result += test_max(rank);
-  result += test_add(rank);
-
-  gop_free(&comm);
-
-#ifdef MPI
-  MPI_Finalize();
-#endif
-
-  return result;
-}
diff --git a/3rdParty/gslib/tests/gs_test_gop_nonblocking.c b/3rdParty/gslib/tests/gs_test_gop_nonblocking.c
deleted file mode 100644
index b1c2a7057..000000000
--- a/3rdParty/gslib/tests/gs_test_gop_nonblocking.c
+++ /dev/null
@@ -1,131 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "comm.h"
-#include "mem.h"
-#include "gs_defs.h"
-#include "gs.h"
-
-struct gs_data *gop_handle;
-int np;
-
-//------------------------------------------------------------------------------
-void gop_init(struct comm *gop_comm, comm_ext world) {
-  comm_init(gop_comm, world);
-
-  const long long gop_id = 1;
-
-  gop_handle = gs_setup(&gop_id, 1, gop_comm, 0, gs_pairwise, 0);
-}
-//------------------------------------------------------------------------------
-void igop(void *u, gs_dom dom, gs_op op, unsigned transpose) {
-  // In a real case, these calls will be split across other code
-  int handle;
-  igs(u, dom, op, transpose, gop_handle, NULL, &handle);
-  gs_wait (handle);
-}
-//------------------------------------------------------------------------------
-void gop_free(struct comm* gop_comm) {
-  comm_free(gop_comm);
-
-  gs_free(gop_handle);
-}
-//------------------------------------------------------------------------------
-int test_imin(int rank) {
-  int min = rank;
-  igop(&min, gs_int, gs_min, 0);
-
-  if (rank == 0) printf("\ngop min test: ");
-  if (min == 0) {
-    if (rank == 0) printf("[Passed]");
-    return 0;
-  } else {
-    if (rank == 0) printf("[Failed]");
-    return 1;
-  }
-}
-//------------------------------------------------------------------------------
-int test_imax(int rank) {
-  int max = rank;
-  igop(&max, gs_int, gs_max, 0);
-
-  if (rank == 0) printf("\ngop max test: ");
-  if (max == np-1) {
-    if (rank == 0) printf("[Passed]");
-    return 0;
-  } else {
-    if (rank == 0) printf("[Failed]");
-    return 1;
-  }
-}
-//------------------------------------------------------------------------------
-int test_iadd(int rank) {
-  int sum = rank;
-  igop(&sum, gs_int, gs_add, 0);
-  sum *= 2;
-
-  if (rank == 0) printf("\ngop add test: ");
-  if (sum == np*(np-1)) {
-    if (rank == 0) printf("[Passed]");
-    return 0;
-  } else {
-    if (rank == 0) printf("[Failed]");
-    return 1;
-  }
-}
-//------------------------------------------------------------------------------
-int test_imul(int rank) {
-  int mul = rank + 1;
-  igop(&mul, gs_int, gs_mul, 0);
-
-  int answer=1, i;
-  for(i = 2; i <= np; i++) {
-    answer*=i;
-  }
-  if (rank == 0) printf("\ngop mul test: ");
-  if (mul == answer) {
-    if (rank == 0) printf("[Passed]");
-    return 0;
-  } else {
-    if (rank == 0) printf("[Failed]");
-    return 1;
-  }
-}
-//------------------------------------------------------------------------------
-int main(int narg, char *arg[])
-{
-  comm_ext world; int rank, result;
-  struct comm comm;
-
-#ifdef MPI
-  MPI_Init(&narg,&arg);
-  world = MPI_COMM_WORLD;
-  MPI_Comm_size(world,&np);
-  MPI_Comm_rank(world,&rank);
-#else
-  world=0, np=1; rank = 0;
-#endif
-
-  gop_init(&comm,world);
-
-  result  = test_imin(rank);
-  result += test_imax(rank);
-  result += test_iadd(rank);
-  result += test_imul(rank);
-
-  gop_free(&comm);
-
-  if (rank == 0) printf("\n");
-
-#ifdef MPI
-  MPI_Finalize();
-#endif
-
-  return result;
-}
diff --git a/3rdParty/gslib/tests/gs_test_old.c b/3rdParty/gslib/tests/gs_test_old.c
deleted file mode 100644
index f6143333e..000000000
--- a/3rdParty/gslib/tests/gs_test_old.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/* simple stand-alone test for parallel gather-scatter routines
-   assumes gather-scatter routines were compiled with default names
-   can compile to sequential version if MPI is not defined
-   
-   the test is as follows, where N is the number of procs:
-     there are N physical nodes (vertices)
-     each proc has 2 local/virtual nodes mapping to each physical node,
-       for a total of 2*N*N virtual nodes
-     virtual nodes are given values that correspond to a sequential ordering
-       (so that they range from 0 to 2*N*N-1)
-       the addition operation is performed and the result is checked,
-       the correct result being known a priori
-     the addition operation is also checked, in a similar manner, for
-       both the cpgs_op_vec and cpgs_op_many routines with vector dimension 3
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#ifdef MPI
-#  include <mpi.h>
-#else
-   typedef void MPI_Comm;
-#endif
-#include "name.h"
-#include "types.h"
-
-typedef long real;
-sint datatype = 3;
-
-#define fgs_setup     FORTRAN_NAME(gs_setup    ,GS_SETUP    )
-#define fgs_op        FORTRAN_NAME(gs_op       ,GS_OP       )
-#define fgs_op_vec    FORTRAN_NAME(gs_op_vec   ,GS_OP_VEC   )
-#define fgs_op_many   FORTRAN_NAME(gs_op_many  ,GS_OP_MANY  )
-#define fgs_op_fields FORTRAN_NAME(gs_op_fields,GS_OP_FIELDS)
-#define fgs_free      FORTRAN_NAME(gs_free     ,GS_FREE     )
-
-void fgs_setup(sint *handle, const slong id[], const sint *n,
-               const MPI_Fint *comm, const sint *np);
-void fgs_op(const sint *handle, void *u, const sint *dom, const sint *op,
-            const sint *transpose);
-void fgs_op_vec(const sint *handle, void *u, const sint *n,
-                const sint *dom, const sint *op, const sint *transpose);
-void fgs_op_many(const sint *handle, void *u1, void *u2, void *u3,
-                 void *u4, void *u5, void *u6, const sint *n,
-                 const sint *dom, const sint *op, const sint *transpose);
-void fgs_free(const sint *handle);
-
-void assert_is_zero(real v)
-{
-  if(fabs(v) < 1e-20) return;
-  printf("test failed\n");
-  exit(1);
-}
-
-int main(int narg, char* arg[])
-{
-  sint transpose=0;
-  sint id=0,np=1;
-  sint i,handle,maxv=3;
-  real *u;
-  slong *glindex;
-#ifndef MPI
-  int comm;
-#else
-  MPI_Init(&narg,&arg);
-  MPI_Comm comm;
-  MPI_Comm_dup(MPI_COMM_WORLD,&comm);
-  MPI_Fint fcomm = MPI_Comm_c2f(comm);
-  { int i;
-    MPI_Comm_rank(comm,&i); id=i;
-    MPI_Comm_size(comm,&i); np=i;
-  }
-#endif
-
-  glindex = malloc(np*2*sizeof(slong));
-  for(i=0;i<np;++i) glindex[2*i+1] = glindex[2*i] = i+1;
-  i=np*2;
-  fgs_setup(&handle,glindex,&i,&fcomm,&np);
-  free(glindex);
-  
-  u = malloc(np*2*sizeof(real));
-  for(i=0;i<np;++i) u[2*i  ] = (real)( 2*np*id + 2*i ),
-                    u[2*i+1] = (real)( 2*np*id + 2*i+1 );
-  /*for(i=0;i<np;++i) printf(" (%g %g)", u[2*i], u[2*i+1]); printf("\n");*/
-  i=1, fgs_op(&handle,u,&datatype,&i,&transpose);
-  /*for(i=0;i<np;++i) printf(" (%g %g)", u[2*i], u[2*i+1]); printf("\n");*/
-  for(i=0;i<np;++i) assert_is_zero( np*(2*np*(np-1)+4*i+1) - u[2*i] ),
-                    assert_is_zero( np*(2*np*(np-1)+4*i+1) - u[2*i+1]  );
-  free(u);
-
-  u = malloc(np*2*3*sizeof(real));
-  for(i=0;i<np;++i)
-    u[3*(2*i  )+0] = (real)( 3*(2*np*id + 2*i  ) + 0 ),
-    u[3*(2*i  )+1] = (real)( 3*(2*np*id + 2*i  ) + 1 ),
-    u[3*(2*i  )+2] = (real)( 3*(2*np*id + 2*i  ) + 2 ),
-    u[3*(2*i+1)+0] = (real)( 3*(2*np*id + 2*i+1) + 0 ),
-    u[3*(2*i+1)+1] = (real)( 3*(2*np*id + 2*i+1) + 1 ),
-    u[3*(2*i+1)+2] = (real)( 3*(2*np*id + 2*i+1) + 2 );
-  /*for(i=0;i<np;++i) {
-    int j;
-    printf("%d: ( ", id);
-    for(j=3*(2*i);j<=3*(2*i+1)+2;++j) printf("%g ",u[j]);
-    printf(")\n");
-  }*/
-  i=1, maxv=3, fgs_op_vec(&handle,u,&maxv,&datatype,&i,&transpose);
-  /*for(i=0;i<np;++i) {
-    int j;
-    printf("%d: ( ", id);
-    for(j=3*(2*i);j<=3*(2*i+1)+2;++j) printf("%g ",u[j]);
-    printf(")\n");
-  }*/
-  for(i=0;i<np;++i)
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*0) - u[3*(2*i  )+0] ),
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*1) - u[3*(2*i  )+1] ),
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*2) - u[3*(2*i  )+2] ),
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*0) - u[3*(2*i+1)+0] ),
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*1) - u[3*(2*i+1)+1] ),
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*2) - u[3*(2*i+1)+2] );
-  free(u);
-
-  u = malloc(np*2*3*sizeof(real));
-  for(i=0;i<np;++i)
-    u[2*np*0+(2*i  )] = (real)( 3*(2*np*id + 2*i  ) + 0 ),
-    u[2*np*1+(2*i  )] = (real)( 3*(2*np*id + 2*i  ) + 1 ),
-    u[2*np*2+(2*i  )] = (real)( 3*(2*np*id + 2*i  ) + 2 ),
-    u[2*np*0+(2*i+1)] = (real)( 3*(2*np*id + 2*i+1) + 0 ),
-    u[2*np*1+(2*i+1)] = (real)( 3*(2*np*id + 2*i+1) + 1 ),
-    u[2*np*2+(2*i+1)] = (real)( 3*(2*np*id + 2*i+1) + 2 );
-  i=1, maxv=3, fgs_op_many(&handle,u,u+2*np,u+4*np,0,0,0,&maxv,
-                           &datatype,&i,&transpose);
-  for(i=0;i<np;++i)
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*0) - u[2*np*0+(2*i  )] ),
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*1) - u[2*np*1+(2*i  )] ),
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*2) - u[2*np*2+(2*i  )] ),
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*0) - u[2*np*0+(2*i+1)] ),
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*1) - u[2*np*1+(2*i+1)] ),
-    assert_is_zero( np*(6*np*(np-1)+12*i+3+2*2) - u[2*np*2+(2*i+1)] );
-  free(u);
-  
-  fgs_free(&handle);
-  printf("test on node %d/%d succeeded\n", (int)id+1, (int)np);
-#ifdef MPI  
-  MPI_Comm_free(&comm);
-  MPI_Finalize();
-#endif
-  return 0;
-}
diff --git a/3rdParty/gslib/tests/gs_unique_test.c b/3rdParty/gslib/tests/gs_unique_test.c
deleted file mode 100644
index e87199e9a..000000000
--- a/3rdParty/gslib/tests/gs_unique_test.c
+++ /dev/null
@@ -1,87 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "comm.h"
-#include "mem.h"
-#include "gs_defs.h"
-#include "gs.h"
-
-static void test(const struct comm *comm)
-{
-  uint i,np=comm->np,id=comm->id;
-  slong *glindex = tmalloc(slong,np*2);
-  char *out, *buf = tmalloc(char,80+np*2*30);
-  struct gs_data *gsh;
-  
-  for(i=0;i<np;++i) glindex[2*i+1]=glindex[2*i]=i+1;
-  
-  out = buf+sprintf(buf, "%03d bgn : [", (int)comm->id);
-  for(i=0;i<np*2;++i) out += sprintf(out, " %+d", (int)glindex[i]);
-  sprintf(out," ]"), puts(buf);
-  
-  gs_unique(glindex,np*2,comm);
-
-  out = buf+sprintf(buf, "%03d end : [", (int)comm->id);
-  for(i=0;i<np*2;++i) out += sprintf(out, " %+d", (int)glindex[i]);
-  sprintf(out," ]"), puts(buf);
-
-  /* non-blocking api */
-  if(comm->id==0) printf("\nTesting non-blocking api ...\n");
-  for(i=0;i<np;++i) glindex[2*i+1]=glindex[2*i]=i+1;
-  gsh=gs_setup(glindex,np*2,comm,1,gs_all_reduce,1);
-  for(i=0;i<np;++i) glindex[2*i+1]=glindex[2*i]=id;
-  int handle;
-  igs(glindex,gs_slong,gs_add,0,gsh,0,&handle);
-  gs_wait(handle);
-  gs_free(gsh);
-
-  out = buf+sprintf(buf, "%03d own : [", (int)comm->id);
-  for(i=0;i<np*2;++i) out += sprintf(out, " %+d", (int)glindex[i]);
-  sprintf(out," ]"), puts(buf);
-
-  /* blocking api */
-  if(comm->id==0) printf("\nTesting blocking api ...\n");
-  for(i=0;i<np;++i) glindex[2*i+1]=glindex[2*i]=i+1;
-  gsh=gs_setup(glindex,np*2,comm,1,gs_auto,1);
-  for(i=0;i<np;++i) glindex[2*i+1]=glindex[2*i]=id;
-  gs(glindex,gs_slong,gs_add,0,gsh,0);
-  gs_free(gsh);
-
-  out = buf+sprintf(buf, "%03d own : [", (int)comm->id);
-  for(i=0;i<np*2;++i) out += sprintf(out, " %+d", (int)glindex[i]);
-  sprintf(out," ]"), puts(buf);
-
-  free(buf);
-  free(glindex);
-}
-
-int main(int narg, char *arg[])
-{
-  comm_ext world; int np;
-  struct comm comm;
-  
-#ifdef MPI
-  MPI_Init(&narg,&arg);
-  world = MPI_COMM_WORLD;
-  MPI_Comm_size(world,&np);
-#else
-  world=0, np=1;
-#endif
-
-  comm_init(&comm,world);
-
-  test(&comm);
-  
-  comm_free(&comm);
-
-#ifdef MPI
-  MPI_Finalize();
-#endif
-
-  return 0;
-}
diff --git a/3rdParty/gslib/tests/lob_bnd_test.c b/3rdParty/gslib/tests/lob_bnd_test.c
deleted file mode 100644
index dc141c400..000000000
--- a/3rdParty/gslib/tests/lob_bnd_test.c
+++ /dev/null
@@ -1,185 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-#include "c99.h"
-#include "types.h"
-#include "name.h"
-#include "fail.h"
-#include "mem.h"
-#include "tensor.h"
-#include "poly.h"
-#include "lob_bnd.h"
-
-
-#define RESFAC 4
-#define N  12
-#define NY 9
-#define NZ 4
-#define REPEAT 1000000
-
-#define PI 3.1415926535897932384626433832795028841971693993751058209749445923
-
-
-int main()
-{
-  int failure=0;
-  uint i,r;
-  double p[NZ*NY*N];
-  double lb[2*(RESFAC*NZ)*(RESFAC*NY)*(RESFAC*N)];
-  double work[2*(RESFAC*N)*(RESFAC*NY)*(NZ+1)];
-
-  double *ld_N = tmalloc(double,N+gll_lag_size(N));
-  lagrange_fun *lag_N = gll_lag_setup(ld_N+N,N);
-
-  double *ld_NY= tmalloc(double,NY+gll_lag_size(NY));
-  lagrange_fun *lag_NY = gll_lag_setup(ld_NY+NY,NY);
-
-  double *ld_NZ= tmalloc(double,NZ+gll_lag_size(NZ));
-  lagrange_fun *lag_NZ = gll_lag_setup(ld_NZ+NZ,NZ);
-  
-  double *lb_N  = tmalloc(double,lob_bnd_size(N ,RESFAC*N ));
-  double *lb_NY = tmalloc(double,lob_bnd_size(NY,RESFAC*NY));
-  double *lb_NZ = tmalloc(double,lob_bnd_size(NZ,RESFAC*NZ));
-  lob_bnd_setup(lb_N , N ,RESFAC*N );
-  lob_bnd_setup(lb_NY, NY,RESFAC*NY );
-  lob_bnd_setup(lb_NZ, NZ,RESFAC*NZ );
-  /*for(i=0;i<NY*N;++i) p[i]=rand()/(double)RAND_MAX;
-  
-  lob_bnd_lin_1(lb, lb_N,N,RESFAC*N, p,NY); */
-  
-  /* 1D */
-  for(r=0;r<REPEAT;++r) {
-    int m = RESFAC*N;
-    double x = (rand()/(double)RAND_MAX)*2-1;
-    /* x = cos((m-1-j)*PI/(m-1)) */
-    int j = -1 + m - 1 - (int) (acos(x) * (m-1) / PI);
-    double f = (x - cos((m-1-j)*PI/(m-1))) /
-               (cos((m-1-(j+1))*PI/(m-1)) - cos((m-1-j)*PI/(m-1)));
-
-    if(r%256==0) {
-      for(i=0;i<NY*N;++i) p[i]=rand()/(double)RAND_MAX;
-  
-      lob_bnd_lin_1(lb, lb_N,N,RESFAC*N, p,NY);
-    }
-
-    if(r<3)
-      printf("%g <= %g <= %g,   f = %g\n",
-        cos((m-1-j)*PI/(m-1)), x, cos((m-1-(j+1))*PI/(m-1)), f);
-    lag_N(ld_N,ld_N+N,N,0,x);
-    for(i=0;i<NY;++i) {
-      double lo = (1-f)*lb[(i*m+j)*2  ] + f*lb[(i*m+j+1)*2  ],
-             up = (1-f)*lb[(i*m+j)*2+1] + f*lb[(i*m+j+1)*2+1],
-             px = tensor_dot(ld_N,p+i*N,N);
-      if(r<3 || px < lo || up < px)
-        printf("p_%02d(%g) = %g in [%g,%g]\n",i,x,px,lo,up);
-      if(px<lo || up<px) {failure=1; break;}
-    }
-    if(i!=NY) break;
-  }
-
-  /* x = cos((m-1-j)*PI/(m-1)) */
-  #define GET_JF(x) \
-    int j##x = -1 + m##x - 1 - (int) (acos(x) * (m##x-1) / PI); \
-    double f##x = (x - cos((m##x-1-j##x)*PI/(m##x-1))) / \
-                (cos((m##x-1-(j##x+1))*PI/(m##x-1)) \
-                 - cos((m##x-1-j##x)*PI/(m##x-1)))
-
-  /* 2D */
-  for(r=0;r<REPEAT;++r) {
-    int mx = RESFAC*N, my = RESFAC*NY;
-    double x = (rand()/(double)RAND_MAX)*2-1,
-           y = (rand()/(double)RAND_MAX)*2-1;
-    GET_JF(x); GET_JF(y);
-
-    if(r%256==0) {
-      for(i=0;i<NZ*NY*N;++i) p[i]=rand()/(double)RAND_MAX;
-  
-      lob_bnd_lin_2(lb, lb_N,N,mx, lb_NY,NY,my, p,NZ, work);
-    }
-
-    if(r<3)
-      printf("x: %g <= %g <= %g,   f = %g\n",
-        cos((mx-1-jx)*PI/(mx-1)), x, cos((mx-1-(jx+1))*PI/(mx-1)), fx),
-      printf("y: %g <= %g <= %g,   f = %g\n",
-        cos((my-1-jy)*PI/(my-1)), y, cos((my-1-(jy+1))*PI/(my-1)), fy);
-    lag_N (ld_N ,ld_N +N ,N ,0,x);
-    lag_NY(ld_NY,ld_NY+NY,NY,0,y);
-
-    for(i=0;i<NZ;++i) {
-      double lo = (1-fx)*(1-fy)*lb[((i*mx+jx  )*my+jy  )*2  ]
-                +    fx *(1-fy)*lb[((i*mx+jx+1)*my+jy  )*2  ]
-                + (1-fx)*   fy *lb[((i*mx+jx  )*my+jy+1)*2  ]
-                +    fx *   fy *lb[((i*mx+jx+1)*my+jy+1)*2  ],
-             up = (1-fx)*(1-fy)*lb[((i*mx+jx  )*my+jy  )*2+1]
-                +    fx *(1-fy)*lb[((i*mx+jx+1)*my+jy  )*2+1]
-                + (1-fx)*   fy *lb[((i*mx+jx  )*my+jy+1)*2+1]
-                +    fx *   fy *lb[((i*mx+jx+1)*my+jy+1)*2+1],
-             pxy = tensor_i2(ld_N,N, ld_NY,NY, p+i*N*NY, work);
-      if(r<3 || pxy < lo || up < pxy)
-        printf("p_%02d(%g,%g) = %g in [%g,%g]\n",i,x,y,pxy,lo,up);
-      if(pxy<lo || up<pxy) {failure=1; break;}
-    }
-    if(i!=NZ) break;
-
-  }
-
-  /* 3D */
-  for(r=0;r<REPEAT;++r) {
-    int mx = RESFAC*N, my = RESFAC*NY, mz = RESFAC*NZ;
-    double x = (rand()/(double)RAND_MAX)*2-1,
-           y = (rand()/(double)RAND_MAX)*2-1,
-           z = (rand()/(double)RAND_MAX)*2-1;
-    GET_JF(x); GET_JF(y); GET_JF(z);
-    if(r%256==0) {
-      for(i=0;i<NZ*NY*N;++i) p[i]=rand()/(double)RAND_MAX;
-  
-      lob_bnd_lin_3(lb, lb_N,N,mx, lb_NY,NY,my, lb_NZ,NZ,mz, p,1, work);
-    }
-
-    if(r<3)
-      printf("x: %g <= %g <= %g,   f = %g\n",
-        cos((mx-1-jx)*PI/(mx-1)), x, cos((mx-1-(jx+1))*PI/(mx-1)), fx),
-      printf("y: %g <= %g <= %g,   f = %g\n",
-        cos((my-1-jy)*PI/(my-1)), y, cos((my-1-(jy+1))*PI/(my-1)), fy),
-      printf("z: %g <= %g <= %g,   f = %g\n",
-        cos((mz-1-jz)*PI/(mz-1)), z, cos((mz-1-(jz+1))*PI/(mz-1)), fz);
-    lag_N (ld_N ,ld_N +N ,N ,0,x);
-    lag_NY(ld_NY,ld_NY+NY,NY,0,y);
-    lag_NZ(ld_NZ,ld_NZ+NZ,NZ,0,z);
-
-    {
-      double lo = 
-                + (1-fx)*(1-fy)*(1-fz)*lb[(((jx  )*my+jy  )*mz+jz  )*2  ]
-                +    fx *(1-fy)*(1-fz)*lb[(((jx+1)*my+jy  )*mz+jz  )*2  ]
-                + (1-fx)*   fy *(1-fz)*lb[(((jx  )*my+jy+1)*mz+jz  )*2  ]
-                +    fx *   fy *(1-fz)*lb[(((jx+1)*my+jy+1)*mz+jz  )*2  ]
-                + (1-fx)*(1-fy)*   fz *lb[(((jx  )*my+jy  )*mz+jz+1)*2  ]
-                +    fx *(1-fy)*   fz *lb[(((jx+1)*my+jy  )*mz+jz+1)*2  ]
-                + (1-fx)*   fy *   fz *lb[(((jx  )*my+jy+1)*mz+jz+1)*2  ]
-                +    fx *   fy *   fz *lb[(((jx+1)*my+jy+1)*mz+jz+1)*2  ],
-             up =
-                + (1-fx)*(1-fy)*(1-fz)*lb[(((jx  )*my+jy  )*mz+jz  )*2+1]
-                +    fx *(1-fy)*(1-fz)*lb[(((jx+1)*my+jy  )*mz+jz  )*2+1]
-                + (1-fx)*   fy *(1-fz)*lb[(((jx  )*my+jy+1)*mz+jz  )*2+1]
-                +    fx *   fy *(1-fz)*lb[(((jx+1)*my+jy+1)*mz+jz  )*2+1]
-                + (1-fx)*(1-fy)*   fz *lb[(((jx  )*my+jy  )*mz+jz+1)*2+1]
-                +    fx *(1-fy)*   fz *lb[(((jx+1)*my+jy  )*mz+jz+1)*2+1]
-                + (1-fx)*   fy *   fz *lb[(((jx  )*my+jy+1)*mz+jz+1)*2+1]
-                +    fx *   fy *   fz *lb[(((jx+1)*my+jy+1)*mz+jz+1)*2+1],
-             pxyz = tensor_i3(ld_N,N, ld_NY,NY, ld_NZ,NZ, p, work);
-      if(r<3 || pxyz < lo || up < pxyz)
-        printf("p(%g,%g,%g) = %g in [%g,%g]\n",x,y,z,pxyz,lo,up);
-      if(pxyz<lo || up<pxyz) failure=1;
-    }
-    if(failure) break;
-
-  }
-  
-  free(lb_NZ), free(lb_NY), free(lb_N), free(ld_NZ), free(ld_NY), free(ld_N);
-  
-  printf("Tests %s\n", failure?"failed":"successful");
-
-  return failure;
-}
diff --git a/3rdParty/gslib/tests/obbox_test.c b/3rdParty/gslib/tests/obbox_test.c
deleted file mode 100644
index bec5bb6a0..000000000
--- a/3rdParty/gslib/tests/obbox_test.c
+++ /dev/null
@@ -1,207 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <float.h>
-#include <string.h>
-#include "c99.h"
-#include "types.h"
-#include "name.h"
-#include "fail.h"
-#include "mem.h"
-#include "poly.h"
-#include "lob_bnd.h"
-#include "obbox.h"
-#include "rand_elt_test.h"
-
-#define REPEAT 20
-
-#define N 100
-#define NR 7
-#define MR (4*NR)
-#define NS 8
-#define MS (4*NS)
-#define NT 9
-#define MT (4*NT)
-
-#define TOL 0.00001
-
-static const unsigned nr[3]={NR,NS,NT}, mr[3]={MR,MS,MT};
-
-static double zr[NR], zs[NS], zt[NT];
-static double x[NR*NS*NT*N], y[NR*NS*NT*N], z[NR*NS*NT*N];
-static double tx[3][NR*NS*NT];
-static const double *const elx[3]={x,y,z};
-
-static struct obbox_2 ob2[N*NT];
-static struct obbox_3 ob3[N];
-
-static struct dbl_range dbl_range_expand(struct dbl_range b, double tol)
-{
-  double a = (b.min+b.max)/2, l = (b.max-b.min)*(1+tol)/2;
-  struct dbl_range m;
-  m.min = a-l, m.max = a+l;
-  return m;
-}
-
-int main()
-{
-  int failure=0;
-  unsigned i;
-  
-  double *lob_bnd_data_r = tmalloc(double,
-      lob_bnd_size(NR,MR)+lob_bnd_size(NS,MS)+lob_bnd_size(NT,MT)),
-    *lob_bnd_data_s = lob_bnd_data_r + lob_bnd_size(NR,MR),
-    *lob_bnd_data_t = lob_bnd_data_s + lob_bnd_size(NS,MS);
-
-  lobatto_nodes(zr,NR); lob_bnd_setup(lob_bnd_data_r,NR,MR);
-  lobatto_nodes(zs,NS); lob_bnd_setup(lob_bnd_data_s,NS,MS);
-  lobatto_nodes(zt,NT); lob_bnd_setup(lob_bnd_data_t,NT,MT);
-
-  /* 2-D */
-  for(i=0;i<REPEAT;++i) {
-    unsigned n; double *x_ = x, *y_ = y;
-    for(n=0;n<N && n<6;++n, x_+=NR*NS*NT, y_+=NR*NS*NT)
-      bubble_elt(x_,y_,z, zr,NR, zs,NS, zt,NT, n);
-    for(n=N-6;n;--n, x_+=NR*NS*NT, y_+=NR*NS*NT)
-      rand_elt_2(x_,y_, zr,NR, zs,NS);
-    obbox_calc_2(ob2, elx, nr,NT*N, mr, TOL);
-    x_=x, y_=y;
-    for(n=0;n<N*NT;++n, x_+=NR*NS, y_+=NR*NS) {
-      const struct obbox_2 *ob = &ob2[n];
-      struct dbl_range xr,yr, tr[2];
-      static double work[2*MR*(NS+MS+1)];
-      unsigned j;
-      for(j=0;j<NR*NS;++j) {
-        const double dx=x_[j]-ob->c0[0], dy=y_[j]-ob->c0[1];
-        tx[0][j] = ob->A[0]*dx+ob->A[1]*dy;
-        tx[1][j] = ob->A[2]*dx+ob->A[3]*dy;
-        if(   (x_[j]-ob->x[0].min)*(ob->x[0].max-x_[j]) < 0
-           || (y_[j]-ob->x[1].min)*(ob->x[1].max-y_[j]) < 0 )
-          failure=1,
-          printf("%d %d (%g,%g) not in [%g,%g] x [%g,%g]\n", n, j,
-            x_[j],y_[j], ob->x[0].min,ob->x[0].max, ob->x[1].min,ob->x[1].max);
-        if(   (tx[0][j]+1)*(1-tx[0][j]) < 0
-           || (tx[1][j]+1)*(1-tx[1][j]) < 0 )
-          failure=1,
-          printf("%d %d (%g,%g) not in [-1,1]^2\n", n, j,
-            tx[0][j],tx[1][j]);
-        if(failure) break;
-      }
-      
-      xr = dbl_range_expand(lob_bnd_2(lob_bnd_data_r,NR,MR,
-                                      lob_bnd_data_s,NS,MS, x_, work), TOL);
-      yr = dbl_range_expand(lob_bnd_2(lob_bnd_data_r,NR,MR,
-                                      lob_bnd_data_s,NS,MS, y_, work), TOL);
-
-      for(j=0;j<2;++j) tr[j] = dbl_range_expand(
-          lob_bnd_2(lob_bnd_data_r,NR,MR, lob_bnd_data_s,NS,MS, tx[j], work)
-        , TOL);
-        
-      if(   ob->x[0].min < xr.min - DBL_EPSILON*128
-         || ob->x[0].max > xr.max + DBL_EPSILON*128 ) failure = 1;
-      if(   ob->x[1].min < yr.min - DBL_EPSILON*128
-         || ob->x[1].max > yr.max + DBL_EPSILON*128 ) failure = 1;
-      
-      for(j=0;j<2;++j)
-        if(   tr[j].min > -1 + DBL_EPSILON*128
-           || tr[j].max <  1 - DBL_EPSILON*128 ) failure = 1;
-           
-      if((i==0&&n==0) || failure) {
-        printf("x: [%g,%g] in [%g,%g]\n", ob->x[0].min, ob->x[0].max,
-                                          xr.min, xr.max);
-        printf("y: [%g,%g] in [%g,%g]\n", ob->x[1].min, ob->x[1].max,
-                                          yr.min, yr.max);
-        for(j=0;j<2;++j)
-          printf("r %d: [%g,%g]\n", j, tr[j].min, tr[j].max);
-      }
-      if(failure) break;
-    }
-    if(failure) break;
-    printf("."); fflush(stdout);
-  }
-  printf("\n");
-
-  /* 3-D */
-  for(i=0;!failure && i<REPEAT;++i) {
-    unsigned n; double *x_ = x, *y_ = y, *z_ = z;
-    for(n=0;n<N && n<6;++n, x_+=NR*NS*NT, y_+=NR*NS*NT, z_+=NR*NS*NT)
-      bubble_elt(x_,y_,z_, zr,NR, zs,NS, zt,NT, n);
-    for(n=N-6;n;--n, x_+=NR*NS*NT, y_+=NR*NS*NT, z_+=NR*NS*NT)
-      rand_elt_3(x_,y_,z_, zr,NR, zs,NS, zt,NT);
-    obbox_calc_3(ob3, elx, nr,N, mr, TOL);
-    x_=x, y_=y, z_=z;
-    for(n=0;n<N;++n, x_+=NR*NS*NT, y_+=NR*NS*NT, z_+=NR*NS*NT) {
-      const struct obbox_3 *ob = &ob3[n];
-      struct dbl_range xr,yr,zr, tr[3];
-      static double work[2*MR*MS*(NT+MT+1)];
-      unsigned j;
-      for(j=0;j<NR*NS*NT;++j) {
-        const double dx=x_[j]-ob->c0[0], dy=y_[j]-ob->c0[1], dz=z_[j]-ob->c0[2];
-        tx[0][j] = ob->A[0]*dx+ob->A[1]*dy+ob->A[2]*dz;
-        tx[1][j] = ob->A[3]*dx+ob->A[4]*dy+ob->A[5]*dz;
-        tx[2][j] = ob->A[6]*dx+ob->A[7]*dy+ob->A[8]*dz;
-        if(   (x_[j]-ob->x[0].min)*(ob->x[0].max-x_[j]) < 0
-           || (y_[j]-ob->x[1].min)*(ob->x[1].max-y_[j]) < 0
-           || (z_[j]-ob->x[2].min)*(ob->x[2].max-z_[j]) < 0 )
-          failure=1,
-          printf("%d %d (%g,%g,%g) not in [%g,%g] x [%g,%g] x [%g,%g]\n", n, j,
-            x_[j],y_[j],z_[j], ob->x[0].min,ob->x[0].max,
-            ob->x[1].min,ob->x[1].max, ob->x[2].min,ob->x[2].max);
-        if(   (tx[0][j]+1)*(1-tx[0][j]) < 0
-           || (tx[1][j]+1)*(1-tx[1][j]) < 0
-           || (tx[2][j]+1)*(1-tx[2][j]) < 0 )
-          failure=1,
-          printf("%d %d (%g,%g,%g) not in [-1,1]^3\n", n, j,
-            tx[0][j],tx[1][j],tx[2][j]);
-        if(failure) break;
-      }
-      
-      xr = dbl_range_expand(lob_bnd_3(lob_bnd_data_r,NR,MR,
-                                      lob_bnd_data_s,NS,MS,
-                                      lob_bnd_data_t,NT,MT, x_, work), TOL);
-      yr = dbl_range_expand(lob_bnd_3(lob_bnd_data_r,NR,MR,
-                                      lob_bnd_data_s,NS,MS,
-                                      lob_bnd_data_t,NT,MT, y_, work), TOL);
-      zr = dbl_range_expand(lob_bnd_3(lob_bnd_data_r,NR,MR,
-                                      lob_bnd_data_s,NS,MS,
-                                      lob_bnd_data_t,NT,MT, z_, work), TOL);
-
-      for(j=0;j<3;++j) tr[j] = dbl_range_expand(
-          lob_bnd_3(lob_bnd_data_r,NR,MR, lob_bnd_data_s,NS,MS,
-                    lob_bnd_data_t,NT,MT, tx[j], work)
-        , TOL);
-        
-      if(   ob->x[0].min < xr.min - DBL_EPSILON*128
-         || ob->x[0].max > xr.max + DBL_EPSILON*128 ) failure = 1;
-      if(   ob->x[1].min < yr.min - DBL_EPSILON*128
-         || ob->x[1].max > yr.max + DBL_EPSILON*128 ) failure = 1;
-      if(   ob->x[2].min < zr.min - DBL_EPSILON*128
-         || ob->x[2].max > zr.max + DBL_EPSILON*128 ) failure = 1;
-      
-      for(j=0;j<3;++j)
-        if(   tr[j].min > -1 + DBL_EPSILON*128
-           || tr[j].max <  1 - DBL_EPSILON*128 ) failure = 1;
-           
-      if((i==0&&n==0) || failure) {
-        printf("x: [%g,%g] in [%g,%g]\n", ob->x[0].min, ob->x[0].max,
-                                          xr.min, xr.max);
-        printf("y: [%g,%g] in [%g,%g]\n", ob->x[1].min, ob->x[1].max,
-                                          yr.min, yr.max);
-        printf("z: [%g,%g] in [%g,%g]\n", ob->x[2].min, ob->x[2].max,
-                                          zr.min, zr.max);
-        for(j=0;j<3;++j)
-          printf("r %d: [%g,%g]\n", j, tr[j].min, tr[j].max);
-      }
-      if(failure) break;
-    }
-    if(failure) break;
-    printf("."); fflush(stdout);
-  }
-  printf("\n");
-        
-  free(lob_bnd_data_r);
- 
-  printf("Tests %s\n", failure?"failed":"successful");
-
-  return failure;
-}
diff --git a/3rdParty/gslib/tests/poly_test.c b/3rdParty/gslib/tests/poly_test.c
deleted file mode 100644
index b2caeef4a..000000000
--- a/3rdParty/gslib/tests/poly_test.c
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <stdio.h>
-#include <float.h>
-#include "c99.h"
-#include "name.h"
-#include "types.h"
-#include "poly.h"
-
-int main()
-{
-  int i, n=13;
-  double z[50], w[50];
-  lobatto_quad(z,w,n);
-  /*
-  for(i=0;i<n;++i)
-    printf("%+20.*Lg\t%+20.*Lg\n",LDBL_DIG+1,(long double)z[i],
-                                  LDBL_DIG+1,(long double)w[i]);
-  */
-  for(i=0;i<n;++i)
-    printf("%+20.*g\t%+20.*g\n",DBL_DIG+1,z[i],
-                                DBL_DIG+1,w[i]);
-  return 0;
-}
-
diff --git a/3rdParty/gslib/tests/run_tests.sh b/3rdParty/gslib/tests/run_tests.sh
deleted file mode 100755
index 59930f17a..000000000
--- a/3rdParty/gslib/tests/run_tests.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/sh
-
-# Set the MPI command and number of MPI ranks
-MPI=mpirun
-np="1 2 3 4"
-
-# Build the tests if they have not been built
-# before.
-make -C .. CC=mpicc CFLAGS="-O2 -g" tests
-
-for i in *.o fortran/*.o; do
-  j=${i%.*}
-  for n in $np; do
-    $MPI -np $n ./$j >> test_log
-    if [ "$?" -eq 0 ]; then
-      echo "Running test: $j, np: $n ... Passed."
-    else
-      echo "Running test: $j, np: $n ... Failed."
-    fi
-  done
-done
diff --git a/3rdParty/gslib/tests/sarray_sort_test.c b/3rdParty/gslib/tests/sarray_sort_test.c
deleted file mode 100644
index 15fa780d9..000000000
--- a/3rdParty/gslib/tests/sarray_sort_test.c
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "mem.h"
-#include "sort.h"
-#include "sarray_sort.h"
-
-int main()
-{
-  struct rec { double d; slong l; sint i; float f; };
-  buffer buf = {0,0,0};
-  struct rec rec[500];
-  uint i;
-  
-  for(i=0;i<500;++i) {
-    sint num1 = rand() & 0xff;
-    slong num2 = rand();
-    num2<<=(CHAR_BIT)*sizeof(int)-1;
-    num2|=rand();
-    num2<<=(CHAR_BIT)*sizeof(int)-1;
-    num2|=rand();
-    num2= num2<0?-num2:num2;
-    rec[i].d = num2;
-    rec[i].f = num2;
-    rec[i].l = num2;
-    rec[i].i = num1;
-  }
-  sarray_sort_2(struct rec,rec,500, i,0, l,1, &buf);
-  for(i=0;i<500;++i)
-    printf("%g\t%g\t%ld\t%d\n",
-      rec[i].d,rec[i].f,(long)rec[i].l,(int)rec[i].i);
-
-  printf("\n");
-  sarray_sort(struct rec,rec,500, l,1, &buf);
-  for(i=0;i<500;++i)
-    printf("%g\t%g\t%ld\t%d\n",
-      rec[i].d,rec[i].f,(long)rec[i].l,(int)rec[i].i);
-  buffer_free(&buf);
-  return 0;
-}
-
diff --git a/3rdParty/gslib/tests/sarray_transfer_test.c b/3rdParty/gslib/tests/sarray_transfer_test.c
deleted file mode 100644
index aaf3b7fd0..000000000
--- a/3rdParty/gslib/tests/sarray_transfer_test.c
+++ /dev/null
@@ -1,93 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "comm.h"
-#include "mem.h"
-#include "sort.h"
-#include "sarray_sort.h"
-#include "crystal.h"
-#include "sarray_transfer.h"
-
-typedef struct {
-  double d;
-  ulong l,l2;
-  uint i;
-  uint p;
-} r_work;
-
-int main(int narg, char *arg[])
-{
-  comm_ext world; int np;
-  struct comm comm;
-  struct crystal crystal;
-  struct array A, A0=null_array; r_work *row, *row_0;
-  uint i;
-#ifdef MPI
-  MPI_Init(&narg,&arg);
-  world = MPI_COMM_WORLD;
-  MPI_Comm_size(world,&np);
-#else
-  world=0, np=1;
-#endif
-
-  comm_init(&comm,world);
-  crystal_init(&crystal,&comm);
-
-  array_init(r_work,&A,np*3), A.n=np*3, row=A.ptr;
-  for(i=0;i<A.n;++i) {
-    row[i].i = rand();
-    row[i].l = row[i].l2 = rand();
-    row[i].p = rand()%np;
-    row[i].d = rand()/(double)rand();
-  }
-  
-  sarray_sort_3(r_work,row,A.n, i,0, l,1, p,0, &crystal.data);
-  
-  for(i=0;i<A.n;++i)
-    printf("%02d send -> %02d: %08x %08x %d %g\n",
-      (int)comm.id,(int)row[i].p,(int)row[i].i,
-      (int)row[i].l,(int)row[i].p,row[i].d);
-  
-  array_cat(r_work,&A0, row,A.n);
-  
-  sarray_transfer(r_work,&A, p,1, &crystal);
-
-  row=A.ptr;
-  for(i=0;i<A.n;++i)
-    printf("%02d recv <- %02d: %08x %08x %d %g\n",
-      (int)comm.id,(int)row[i].p,(int)row[i].i,
-      (int)row[i].l,(int)row[i].p,row[i].d);
-
-  sarray_transfer(r_work,&A, p,1, &crystal);
-  sarray_sort_3(r_work,row,A.n, i,0, l,1, p,0, &crystal.data);
-  if(A.n!=A0.n)
-    fail(1,__FILE__,__LINE__,"final array has different length than original");
-  row=A.ptr, row_0=A0.ptr;
-  for(i=0;i<A.n;++i)
-    if(   row[i].d != row_0[i].d
-       || row[i].l != row_0[i].l
-       || row[i].l2!= row_0[i].l2
-       || row[i].i != row_0[i].i
-       || row[i].p != row_0[i].p)
-      fail(1,__FILE__,__LINE__,"final array differs from original");
-      
-  array_free(&A0);
-  array_free(&A);
-  crystal_free(&crystal);
-
-  fflush(stdout); comm_barrier(&comm);
-  if(comm.id==0) printf("tests passed\n"), fflush(stdout);
-  
-  comm_free(&comm);
-  
-#ifdef MPI
-  MPI_Finalize();
-#endif
-
-  return 0;
-}
diff --git a/3rdParty/gslib/tests/sort_test.c b/3rdParty/gslib/tests/sort_test.c
deleted file mode 100644
index acd0bb387..000000000
--- a/3rdParty/gslib/tests/sort_test.c
+++ /dev/null
@@ -1,113 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <limits.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "mem.h"
-#include "sort.h"
-
-#define SMALL 22
-#define NUM   500
-#define SI 9
-
-ulong A[NUM][SI], Av[NUM];
-uint  B[NUM][SI], Bv[NUM];
-
-uint P[NUM], Q[NUM];
-
-int main()
-{
-  buffer buf = {0,0,0};
-  uint i;
-
-  /*buffer_init(&buf, sortp_long_worksize(NUM,0));*/
-
-#if 0
-  printf("\nsource:\n");
-#endif
-  for(i=0;i!=NUM;++i) {
-    A[i][0]=rand();
-    A[i][0]<<=CHAR_BIT*sizeof(int)-1;
-    A[i][0]^=rand();
-    A[i][0]<<=CHAR_BIT*sizeof(int)-1;
-    A[i][0]^=rand();
-    if(0) A[i][0]&=0x000ff00;
-    B[i][0]=A[i][0];
-#if 0    
-    printf("%016lx\t%016lx\n",(unsigned long)A[i][0],(unsigned long)B[i][0]);
-#endif
-  }
-#if 0
-  printf("\n");
-#endif
-  printf("merge sort:\n");
-  for(i=0;i!=SMALL;++i) Q[i]=SMALL-1-i;
-  sortv_long(Av,  &A[0][0],SMALL,sizeof(ulong[SI]), &buf);
-  sortp_long(&buf,0, &A[0][0],SMALL,sizeof(ulong[SI]));
-    memcpy(P,buf.ptr,SMALL*sizeof(uint));
-  memcpy(buf.ptr,Q,SMALL*sizeof(uint));
-  sortp_long(&buf,1, &A[0][0],SMALL,sizeof(ulong[SI]));
-    memcpy(Q,buf.ptr,SMALL*sizeof(uint));
-  for(i=0;i!=SMALL;++i)
-    printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i],
-           (unsigned long)A[P[i]][0],
-           A[P[i]][0]==A[Q[i]][0],
-           Av[i]==A[P[i]][0]);
-  printf("\n");
-  printf("radix sort:\n");
-  for(i=0;i!=NUM;++i) Q[i]=NUM-1-i;
-  sortv_long(Av,  &A[0][0],NUM,sizeof(ulong[SI]), &buf);
-  sortp_long(&buf,0, &A[0][0],NUM,sizeof(ulong[SI]));
-    memcpy(P,buf.ptr,NUM*sizeof(uint));
-  memcpy(buf.ptr,Q,NUM*sizeof(uint));
-  sortp_long(&buf,1, &A[0][0],NUM,sizeof(ulong[SI]));
-    memcpy(Q,buf.ptr,NUM*sizeof(uint));
-  for(i=0;i!=NUM;++i)
-    printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i],
-           (unsigned long)A[P[i]][0],
-           A[P[i]][0]==A[Q[i]][0],
-           Av[i]==A[P[i]][0]);
-
-  printf("\nsmall integers:\n");
-  printf("\n");
-
-  printf("heap sort:\n");
-  for(i=0;i!=SMALL;++i) Q[i]=SMALL-1-i;
-  sortv(Q,  Q,SMALL,sizeof(uint), &buf);
-  for(i=0;i!=SMALL;++i) printf("\t%u\n",(unsigned)Q[i]);
-
-  printf("merge sort:\n");
-  for(i=0;i!=SMALL;++i) Q[i]=SMALL-1-i;
-  sortv(Bv,  &B[0][0],SMALL,sizeof(uint[SI]), &buf);
-  sortp(&buf,0, &B[0][0],SMALL,sizeof(uint[SI]));
-    memcpy(P,buf.ptr,SMALL*sizeof(uint));
-  memcpy(buf.ptr,Q,SMALL*sizeof(uint));
-  sortp(&buf,1, &B[0][0],SMALL,sizeof(uint[SI]));
-    memcpy(Q,buf.ptr,SMALL*sizeof(uint));
-  for(i=0;i!=SMALL;++i)
-    printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i],
-           (unsigned long)B[P[i]][0],
-           B[P[i]][0]==B[Q[i]][0],
-           B[P[i]][0]==Bv[i]);
-  printf("\n");
-  printf("radix sort:\n");
-  for(i=0;i!=NUM;++i) Q[i]=NUM-1-i;
-  sortv(Bv,  &B[0][0],NUM,sizeof(uint[SI]), &buf);
-  sortp(&buf,0, &B[0][0],NUM,sizeof(uint[SI]));
-    memcpy(P,buf.ptr,NUM*sizeof(uint));
-  memcpy(buf.ptr,Q,NUM*sizeof(uint));
-  sortp(&buf,1, &B[0][0],NUM,sizeof(uint[SI]));
-    memcpy(Q,buf.ptr,NUM*sizeof(uint));
-  for(i=0;i!=NUM;++i)
-    printf("%u\t%u\t%016lx\t%d\t%d\n",(unsigned)P[i],(unsigned)Q[i],
-           (unsigned long)B[P[i]][0],
-           B[P[i]][0]==B[Q[i]][0],
-           B[P[i]][0]==Bv[i]);
-  buffer_free(&buf);
-  return 0;
-}
-
diff --git a/3rdParty/gslib/tests/sort_test2.c b/3rdParty/gslib/tests/sort_test2.c
deleted file mode 100644
index d3ed601bb..000000000
--- a/3rdParty/gslib/tests/sort_test2.c
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <limits.h>
-#include <string.h>
-#include "c99.h"
-#include "name.h"
-#include "fail.h"
-#include "types.h"
-#include "mem.h"
-#include "sort.h"
-
-#if 1
-
-#define N (1<<20)
-
-ulong A[N], out[N];
-uint P[N];
-
-int main()
-{
-  buffer buf = null_buffer;
-  uint i;
-  unsigned long long tic, toc;
-  unsigned r;
-
-  for(i=0;i!=N;++i) {
-    A[i]=rand();
-    A[i]<<=CHAR_BIT*sizeof(int)-1;
-    A[i]^=rand();
-    A[i]<<=CHAR_BIT*sizeof(int)-1;
-    A[i]^=rand();
-    if(0) A[i]&=0x000ff00;
-  }
-
-  for(i=N;i;i>>=1) {
-    unsigned long long t;
-    sortv_long(out, A,i,sizeof(ulong), &buf);
-  }
-
-  for(i=N;i;i>>=1) {
-    unsigned long long t;
-    sortp_long(&buf,0, A,i,sizeof(ulong));
-  }
-
-  buffer_free(&buf);
-  return 0;
-}
-
-#else
-
-int main()
-{
-  return 0;
-}
-
-#endif
-
diff --git a/LICENSE b/LICENSE
index 87572be49..135c3fa96 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (c) 2017-2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index b57f12847..796a2c49e 100644
--- a/README.md
+++ b/README.md
@@ -16,11 +16,10 @@ If you use any part of libParanumal in your research project including variants
 @MISC{ChalmersKarakusAustinSwirydowiczWarburton2020,
       author = "Chalmers, N. and Karakus, A. and Austin, A. P. and Swirydowicz, K. and Warburton, T.",
       title = "{libParanumal}: a performance portable high-order finite element library",
-      year = "2020",
+      year = "2022",
       url = "https://github.com/paranumal/libparanumal",
       doi = "10.5281/zenodo.4004744",
-      note = "Release 0.4.0"
-      }
+      note = "Release 0.5.0"}
 </pre>
 
 see the [references](#10-references) section below for additional papers to reference about various aspects of the library.
@@ -42,7 +41,9 @@ A. Supported elements:
 
 B. Mesh wrangling:
   - Gmsh format file loaders.
-  - Load balanced geometric partitioning using space filling curves (Hilbert or Morton ordering).
+  - Load balanced inertial partitioning.
+  - Load balanced multi-level spectral partitioning.
+  - Cuthill-McKee local ordering.
 
 C. Time integrators:
   - Adaptive rate Dormand-Prince order 5 Runge-Kutta.
@@ -59,7 +60,7 @@ D. Iterative linear solvers:
 E. Elliptic solver:
   - Linear Poisson and screened Poisson potential solvers.
   - GPU-optimized matrix-vector products.
-  - p-type multigrid, algebraic multigrid, low-order SEMFEM, Overlapping Additive Schwarz, and Jacobi preconditioning.
+  - p-type multigrid, algebraic multigrid (smoothed and unsmoothed aggregation), low-order SEMFEM, Overlapping Additive Schwarz, and Jacobi preconditioning.
   - Matrix-free p-multigrid for fine levels of multigrid hierarchy.
 
 F. Heterogeneous accelerated flow solvers:
@@ -75,45 +76,51 @@ F. Heterogeneous accelerated flow solvers:
      * Extrapolation-BDF integration in time.
      * Sub-cycling (Operator Integration Factor Splitting) for advection.
 
-G. Dependencies:
+G. Portability:
+  - Ships with the Open Concurrent Compute Abstraction (OCCA)
+  - At build time, OCCA will try to detect if any of these execution models are installed: OpenMP, CUDA, OpenCL, HIP, and/or SYCL.
+  - Execution model can be selected at runtime. 
+    - If OCCA does not detect a chosen mode of execution it will default to Serial execution.
+    - You will need to adjust the libParanumal setup input files to choose the execution model and compute device appropriate for your system.
+      
+H. Dependencies:
    - Message Passing Interface (MPI v3.0 or higher).
       * The libParanumal makefiles assume that mpic++ is installed and visible in your path.
-   - Open Concurrent Compute Abstraction (OCCA)
-      * OCCA must be installed.
-      * OCCA will try to detect if any of these execution models are installed: OpenMP, CUDA, OpenCL, and/or HIP.
-      * By default, if OCCA does not detect a chosen mode of execution it will default to Serial execution.
-      * You will need to adjust the libParnumal setup input files to choose the execution model and compute device appropriate for your system.
-      * The OCCA github repo is [here](https://github.com/libocca/occa)
-      * The OCCA webpage is [here](http://libocca.org)
+   
 
 
 ---
 ### 4. Code block diagram
-<img src="http://intranet.math.vt.edu/people/tcew/libPdiagramCropV2.jpg" width="512" >
+![libParanumal Code Diagram](./.github/CodeDiagram.png)
 
 ---
 ### 5. OCCA dependency
-OCCA is held as a git submodule inside libParanumal. If you did not clone with `--recursive` then run the following command before building.
-`git submodule init`
-`git submodule update`
+OCCA is held as a git submodule inside libParanumal. If you did not clone with `--recursive` then run the following commands before building.
+```
+git submodule init
+git submodule update
+```
 
 ---
 ### 6. Required Libraries
-libParanumal requires installed BLAS and LAPACK libraries. By default, the build system will look for `libblas` and `liblapack` in your default library search paths. The library paths can also be manually specified in `make.top` with the `LIBP_BLAS_DIR` and `LIBP_LAPACK_DIR` variables.
+libParanumal requires installed BLAS and LAPACK libraries. By default, the build system will look for a serial (i.e. non-threaded) OpenBLAS in your default library search paths. The BLAS and LAPACK library paths can also be manually specified in `make.top` with the `LIBP_BLAS_DIR` and `LIBP_BLAS_LIB` variables.
 
-Some Linux distributions will package BLAS and LAPACK libraries. For example, on Ubuntu systems these libraries can be installed via
-```sudo apt install libblas-dev liblapack-dev```
-
-
-libParanumal also depends on the [gslib](https://github.com/Nek5000/gslib) library for gather-scatter operations. For more information on gslib see [Henry Tufo's thesis](https://dl.acm.org/doi/book/10.5555/926758) and a more recent reference [Fischer et al.](https://iopscience.iop.org/article/10.1088/1742-6596/125/1/012076/meta). The source code for gslib is included in this repository.
+Some Linux distributions will package a serial OpenBLAS library. For example, on Ubuntu systems this library can be installed via
+```
+sudo apt install libopenblas-serial-dev
+```
 
 ---
 ### 7. Clone: libParanumal
-`git clone https://github.com/paranumal/libparanumal`
+```
+git clone --recursive https://github.com/paranumal/libparanumal
+```
 
 #### 7-1. Build all libParanumal solvers
-`cd libparanumal`
-```make -j `nproc` ```
+```
+cd libparanumal
+make -j `nproc`
+```
 
 ---
 ### 8. Running the codes:
@@ -122,16 +129,30 @@ Each solver resides in its respective sub-directory in `solvers/`. Each solver s
 
 #### 8-1. Build libParanumal elliptic solver
 
-`cd libparanumal/solvers/elliptic`
-```make -j `nproc` ```
+```
+cd libparanumal/solvers/elliptic
+make -j `nproc` 
+```
 
 #### 8-2. Run elliptic example with provided quadrilateral set up file on a single device:
 
-`./ellipticMain setups/setupQuad2D.rc`
+libParanumal will make use of extra CPU cores if available. It is therefore beneficial to bind the MPI process to several CPU cores, if possible. For example, running the libParanumal elliptic solver with OpenMPI on a system with 16 CPU cores can be done via
+
+```
+mpiexec -np 1 --map-by slot:PE=16 ./ellipticMain setups/setupQuad2D.rc
+```
+
+The number of CPU cores used can also be controlled with the `OMP_NUM_THREADS` environment variable. libParanumal will not use more threads than there are physical CPU cores on the system, however, even in the presence of this environment variable.
 
 #### 8-3. Run the same example with four devices:
 
-`mpiexec -n 4 ./ellipticMain setups/setupQuad2D.rc`
+As the number of MPI processes per system increases, it is advisable to reduce the number of CPU cores per process to avoid oversubscribing the CPU cores. Using the same example above of the libParanumal elliptic solver with OpenMPI on a system with 16 CPU cores, a four rank run could be done via
+
+```
+mpiexec -np 4 --map-by slot:PE=4 ./ellipticMain setups/setupQuad2D.rc
+```
+
+i.e. each process binds to four of the 16 CPU cores available.
 
 ---
 
@@ -139,7 +160,7 @@ Each solver resides in its respective sub-directory in `solvers/`. Each solver s
 
 The MIT License (MIT)
 
-Copyright (c) 2017-2021 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -182,6 +203,10 @@ Low-order preconditioning of triangular elements (elliptic precon): [publisher](
 
 ### 11. Technical Reports
 
+CEED MS38: [link](https://doi.org/10.5281/zenodo.6475857): `Kolev, T., Fischer, P., Abdelfattah, A.,  Beams, N.,  Brown, J.,  Camier, J.-S., Carson, R., Chalmers, N.,  Dobrev, V.,  Dudouit, Y.,  Ghaffari, L., Joshi, A. Y., Kerkemeier, S., Lan, Y.-H., McDougall, D., Medina, D.,  Min, M., Mishra, A.,  Pazner, W., Phillips, M.,  Ratnayaka, T., Shephard, M. S., Siboni, M. H.,  Smith, C. W.,  Thompson, J. L., Tomboulides, A.,  Tomov, S., Tomov, V., Warburton, T., 2022. ECP Milestone Report: High-order algorithmic developments and optimizations for more robust exascale applications, WBS 2.2.6.06, Milestone CEED-MS38.`
+
+CEED MS37: [link](https://doi.org/10.5281/zenodo.5542244): `Kolev, T., Fischer, P.,  Beams, N.,  Brown, J.,  Camier, J.-S., Chalmers, N.,  Dobrev, V.,  Dudouit, Y., Kerkemeier, S., Lan, Y.-H., Lin, Y., Lindquist, N., McDougall, D., Medina, D., Merzari, E.,  Min, M., Moe, S.,  Pazner, W., Phillips, M.,  Ratnayaka, T., Rowe, K., Shephard, M. S.,  Smith, C. W.,  Tomov, S., Warburton, T., 2022. CEED ECP Milestone Report: Port and optimize the CEED software stack to Aurora / Frontier EA Systems, WBS 2.2.6.06, Milestone CEED-MS37.`
+
 CEED MS36: [link](https://doi.org/10.5281/zenodo.4672664): `Kolev, T., Fischer, P., Austin, A.P., Barker, A.T., Beams, N.,   Brown, J., Camier, J.-S., Chalmers, N.,   Dobrev, V., Dudouit, Y.,  Ghaffari, L., Kerkemeier, S.,  Lan, Y.-H., Merzari, E.,  Min, M.,   Pazner, W., Ratnayaka, T., Shephard, M. S., Siboni, M.H.,   Smith, C.W.,   Thompson, J.L.,   Tomov, S., Warburton, T., 2021. ECP Milestone Report: High-order algorithmic developments and optimizations for large-scale GPU-accelerated simulations, WBS 2.2.6.06, Milestone CEED-MS36.`
 
 CEED MS35: [link](https://doi.org/10.5281/zenodo.4146400): `Kolev, T., Fischer, P.,  Abdelfattah, A.,  Barra, V.,  Beams, N.,  Brown, J., Camier, J.S., Chalmers, N.,  Dobrev, V., Kerkemeier, S., Lan, Y.H., Merzari, E.,  Min, M., Phillips, M., Ratnayaka, T., Rowe, K.,  Thompson, J., Tomboulides, A.,  Tomov, S., Tomov, V,. and Warburton, T., 2020. ECP Milestone Report: Support CEED-enabled ECP applications in their preparation for Aurora/Frontier, WBS 2.2.6.06, Milestone CEED-MS35.`
diff --git a/include/comm.hpp b/include/comm.hpp
new file mode 100644
index 000000000..4cd03dddc
--- /dev/null
+++ b/include/comm.hpp
@@ -0,0 +1,565 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef LIBP_COMM_HPP
+#define LIBP_COMM_HPP
+
+#include <mpi.h>
+#include "core.hpp"
+
+namespace libp {
+
+#define MAX_PROCESSOR_NAME MPI_MAX_PROCESSOR_NAME
+
+/*Generic data type: a T with no specialization is described as sizeof(T) contiguous MPI_CHARs*/
+template<typename T>
+struct mpiType {
+  static MPI_Datatype getMpiType() {
+    MPI_Datatype type;
+    MPI_Type_contiguous(sizeof(T), MPI_CHAR, &type);
+    MPI_Type_commit(&type);
+    return type; // a newly committed type on every call; pair with freeMpiType
+  }
+  static void freeMpiType(MPI_Datatype type) {
+    MPI_Type_free(&type);
+  }
+  static constexpr bool isMpiType() { return false; } // false: not one of the pre-defined MPI datatypes
+};
+
+/*Pre-defined MPI datatypes*/
+#define TYPE(T, MPI_T)                               \
+template<> struct mpiType<T> {                       \
+  static MPI_Datatype getMpiType() { return MPI_T; } \
+  static void freeMpiType(MPI_Datatype type) { }     \
+  static constexpr bool isMpiType() { return true; } \
+}
+
+TYPE(char,   MPI_CHAR);
+TYPE(int,    MPI_INT);
+TYPE(long long int, MPI_LONG_LONG_INT);
+TYPE(float,  MPI_FLOAT);
+TYPE(double, MPI_DOUBLE);
+#undef TYPE
+
+class comm_t;
+
+namespace Comm {
+
+  using request_t = MPI_Request;
+
+  /*Predefined ops*/
+  using op_t = MPI_Op;
+  static constexpr op_t Max  = MPI_MAX;
+  static constexpr op_t Min  = MPI_MIN;
+  static constexpr op_t Sum  = MPI_SUM;
+  static constexpr op_t Prod = MPI_PROD;
+  static constexpr op_t And  = MPI_LAND;
+  static constexpr op_t Or   = MPI_LOR;
+  static constexpr op_t Xor  = MPI_LXOR;
+
+  /*MPI_Init and MPI_Finalize*/
+  void Init(int &argc, char** &argv);
+  void Finalize();
+
+  /*handle to MPI_COMM_WORLD*/
+  comm_t World();
+
+  void GetProcessorName(char* name, int &namelen);
+
+} //namespace Comm
+
+/*Communicator class*/
+class comm_t {
+
+ private:
+  std::shared_ptr<MPI_Comm> comm_ptr;
+  int _rank=0;
+  int _size=0;
+
+ public:
+  comm_t() = default;
+  comm_t(const comm_t &c) = default;
+  comm_t& operator = (const comm_t &c)=default;
+
+  /*MPI_Comm_dup and MPI_Comm_delete*/
+  comm_t Dup() const;
+  comm_t Split(const int color, const int key) const;
+  void Free();
+
+  /*Rank and size getters*/
+  const int rank() const;
+  const int size() const;
+
+  /*MPI_Comm getter*/
+  MPI_Comm comm() const;
+
+  /*libp::memory send*/
+  template <template<typename> class mem, typename T>
+  void Send(mem<T> m,
+            const int dest,
+            const int count=-1,
+            const int tag=0) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
+    MPI_Send(m.ptr(), cnt, type, dest, tag, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory recv: blocking receive of cnt entries (m.length() when count==-1) from source; the receive status is discarded*/
+  template <template<typename> class mem, typename T>
+  void Recv(mem<T> m,
+            const int source,
+            const int count=-1,
+            const int tag=0) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
+    MPI_Recv(m.ptr(), cnt, type, source, tag, comm(), MPI_STATUS_IGNORE); // MPI_Recv requires a status argument
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar send*/
+  template <typename T>
+  void Send(T& val,
+            const int dest,
+            const int tag=0) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Send(&val, 1, type, dest, tag, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar recv: blocking receive of a single T into val from source; the receive status is discarded*/
+  template <typename T>
+  void Recv(T& val,
+            const int source,
+            const int tag=0) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Recv(&val, 1, type, source, tag, comm(), MPI_STATUS_IGNORE); // MPI_Recv requires a status argument
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory non-blocking send*/
+  template <template<typename> class mem, typename T>
+  void Isend(mem<T> m,
+             const int dest,
+             const int count,
+             const int tag,
+             Comm::request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Isend(m.ptr(), count, type, dest, tag, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory non-blocking recv*/
+  template <template<typename> class mem, typename T>
+  void Irecv(mem<T> m,
+             const int source,
+             const int count,
+             const int tag,
+             Comm::request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Irecv(m.ptr(), count, type, source, tag, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar non-blocking send*/
+  template <typename T>
+  void Isend(T& val,
+             const int dest,
+             const int tag,
+             Comm::request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Isend(&val, 1, type, dest, tag, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar non-blocking recv*/
+  template <typename T>
+  void Irecv(T& val,
+             const int source,
+             const int tag,
+             Comm::request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Irecv(&val, 1, type, source, tag, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory broadcast*/
+  template <template<typename> class mem, typename T>
+  void Bcast(mem<T> m,
+             const int root,
+             const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
+    MPI_Bcast(m.ptr(), cnt, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar broadcast*/
+  template <typename T>
+  void Bcast(T& val,
+             const int root) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Bcast(&val, 1, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory reduce*/
+  template <template<typename> class mem, typename T>
+  void Reduce(const mem<T> snd,
+                    mem<T> rcv,
+              const int root,
+              const Comm::op_t op = Comm::Sum,
+              const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(snd.length()) : count;
+    MPI_Reduce(snd.ptr(), rcv.ptr(), cnt, type, op, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory in-place reduce: m is the contribution on every rank and holds the result on root*/
+  template <template<typename> class mem, typename T>
+  void Reduce(mem<T> m,
+              const int root,
+              const Comm::op_t op = Comm::Sum,
+              const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count; // count==-1 selects all of m
+    if (_rank==root) {
+      MPI_Reduce(MPI_IN_PLACE, m.ptr(), cnt, type, op, root, comm()); // root: m is both input and output
+    } else {
+      MPI_Reduce(m.ptr(), nullptr, cnt, type, op, root, comm()); // non-root: send only, no receive buffer
+    }
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar reduce*/
+  template <typename T>
+  void Reduce(const T& snd,
+                    T& rcv,
+              const int root,
+              const Comm::op_t op = Comm::Sum) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Reduce(&snd, &rcv, 1, type, op, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+  template <typename T>
+  void Reduce(T& val,
+              const int root,
+              const Comm::op_t op = Comm::Sum) const {
+    T rcv=val;
+    Reduce(val, rcv, root, op);
+    if (rank()==root) val=rcv;
+  }
+
+  /*libp::memory allreduce*/
+  template <template<typename> class mem, typename T>
+  void Allreduce(const mem<T> snd,
+                       mem<T> rcv,
+                 const Comm::op_t op = Comm::Sum,
+                 const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(snd.length()) : count;
+    MPI_Allreduce(snd.ptr(), rcv.ptr(), cnt, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory in-place allreduce*/
+  template <template<typename> class mem, typename T>
+  void Allreduce(mem<T> m,
+                 const Comm::op_t op = Comm::Sum,
+                 const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
+    MPI_Allreduce(MPI_IN_PLACE, m.ptr(), cnt, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar allreduce*/
+  template <typename T>
+  void Allreduce(const T& snd,
+                       T& rcv,
+                 const Comm::op_t op = Comm::Sum) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Allreduce(&snd, &rcv, 1, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+  template <typename T>
+  void Allreduce(T& val,
+                 const Comm::op_t op = Comm::Sum) const {
+    T rcv=val;
+    Allreduce(val, rcv, op);
+    val = rcv;
+  }
+
+  /*libp::memory non-blocking allreduce*/
+  template <template<typename> class mem, typename T>
+  void Iallreduce(const mem<T> snd,
+                        mem<T> rcv,
+                  const Comm::op_t op,
+                  const int count,
+                  Comm::request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Iallreduce(snd.ptr(), rcv.ptr(), count, type, op, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory non-blocking in-place allreduce*/
+  template <template<typename> class mem, typename T>
+  void Iallreduce(mem<T> m,
+                  const Comm::op_t op,
+                  const int count,
+                  Comm::request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Iallreduce(MPI_IN_PLACE, m.ptr(), count, type, op, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar non-blocking allreduce: snd and rcv must stay valid until request completes (see Wait)*/
+  template <typename T>
+  void Iallreduce(const T& snd,
+                        T& rcv,
+                  const Comm::op_t op,
+                  Comm::request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Iallreduce(&snd, &rcv, 1, type, op, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+  /*scalar non-blocking in-place allreduce: val must stay valid until request completes (see Wait)*/
+  template <typename T>
+  void Iallreduce(T& val,
+                  const Comm::op_t op,
+                  Comm::request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Iallreduce(MPI_IN_PLACE, &val, 1, type, op, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory scan*/
+  template <template<typename> class mem, typename T>
+  void Scan(const mem<T> snd,
+                  mem<T> rcv,
+            const Comm::op_t op = Comm::Sum,
+            const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(snd.length()) : count;
+    MPI_Scan(snd.ptr(), rcv.ptr(), cnt, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory in-place scan*/
+  template <template<typename> class mem, typename T>
+  void Scan(mem<T> m,
+            const Comm::op_t op = Comm::Sum,
+            const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
+    MPI_Scan(MPI_IN_PLACE, m.ptr(), cnt, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar scan*/
+  template <typename T>
+  void Scan(const T& snd,
+                  T& rcv,
+            const Comm::op_t op = Comm::Sum) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Scan(&snd, &rcv, 1, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory gather*/
+  template <template<typename> class mem, typename T>
+  void Gather(const mem<T> snd,
+                    mem<T> rcv,
+              const int root,
+              const int sendCount=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (sendCount==-1) ? static_cast<int>(snd.length()) : sendCount;
+    MPI_Gather(snd.ptr(), cnt, type,
+               rcv.ptr(), cnt, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory gatherv*/
+  template <template<typename> class mem, typename T>
+  void Gatherv(const mem<T> snd,
+               const int sendcount,
+                     mem<T> rcv,
+               const memory<int> recvCounts,
+               const memory<int> recvOffsets,
+               const int root) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Gatherv(snd.ptr(), sendcount, type,
+                rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
+                root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar gather*/
+  template <template<typename> class mem, typename T>
+  void Gather(const T& snd,
+                    mem<T> rcv,
+              const int root) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Gather(&snd,      1, type,
+               rcv.ptr(), 1, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory scatter*/
+  template <template<typename> class mem, typename T>
+  void Scatter(const mem<T> snd,
+                     mem<T> rcv,
+               const int root,
+               const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(rcv.length()) : count;
+    MPI_Scatter(snd.ptr(), cnt, type,
+                rcv.ptr(), cnt, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory scatterv*/
+  template <template<typename> class mem, typename T>
+  void Scatterv(const mem<T> snd,
+                const memory<int> sendCounts,
+                const memory<int> sendOffsets,
+                      mem<T> rcv,
+                const int recvcount,
+                const int root) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Scatterv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type,
+                 rcv.ptr(), recvcount, type,
+                 root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar scatter: root sends one entry of snd to each rank, received into rcv*/
+  template <template<typename> class mem, typename T>
+  void Scatter(T& rcv,
+               const mem<T> snd,
+               const int root) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Scatter(snd.ptr(), 1, type,
+                &rcv,      1, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory allgather*/
+  template <template<typename> class mem, typename T>
+  void Allgather(const mem<T> snd,
+                       mem<T> rcv,
+                 const int sendCount=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (sendCount==-1) ? static_cast<int>(snd.length()) : sendCount;
+    MPI_Allgather(snd.ptr(), cnt, type,
+                  rcv.ptr(), cnt, type, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+  template <template<typename> class mem, typename T>
+  void Allgather(mem<T> m,
+                 const int cnt) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Allgather(MPI_IN_PLACE, cnt, type,
+                  m.ptr(),      cnt, type, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory allgatherv*/
+  template <template<typename> class mem, typename T>
+  void Allgatherv(const mem<T> snd,
+                  const int sendcount,
+                        mem<T> rcv,
+                  const memory<int> recvCounts,
+                  const memory<int> recvOffsets) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Allgatherv(snd.ptr(), sendcount, type,
+                   rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
+                   comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar allgather*/
+  template <template<typename> class mem, typename T>
+  void Allgather(const T& snd,
+                       mem<T> rcv) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Allgather(&snd,      1, type,
+                  rcv.ptr(), 1, type, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory alltoall*/
+  template <template<typename> class mem, typename T>
+  void Alltoall(const mem<T> snd,
+                      mem<T> rcv,
+                const int cnt=1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Alltoall(snd.ptr(), cnt, type,
+                 rcv.ptr(), cnt, type, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory alltoallv*/
+  template <template<typename> class mem, typename T>
+  void Alltoallv(const mem<T> snd,
+                 const memory<int> sendCounts,
+                 const memory<int> sendOffsets,
+                       mem<T> rcv,
+                 const memory<int> recvCounts,
+                 const memory<int> recvOffsets) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Alltoallv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type,
+                  rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
+                  comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  template <template<typename> class mem, typename T>
+  void Ialltoallv(const mem<T> snd,
+                  const memory<int> sendCounts,
+                  const memory<int> sendOffsets,
+                        mem<T> rcv,
+                  const memory<int> recvCounts,
+                  const memory<int> recvOffsets,
+                  Comm::request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Ialltoallv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type,
+                  rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
+                  comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  void Wait(Comm::request_t &request) const;
+  void Waitall(const int count, memory<Comm::request_t> &requests) const;
+  void Barrier() const;
+
+  friend comm_t Comm::World();
+};
+
+} //namespace libp
+
+#endif
diff --git a/include/core.hpp b/include/core.hpp
index 0319e9d87..6cbb04d8c 100644
--- a/include/core.hpp
+++ b/include/core.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,58 +27,28 @@ SOFTWARE.
 #ifndef CORE_HPP
 #define CORE_HPP
 
-#include <mpi.h>
-#include <occa.hpp>
-#include <cstring>
-#include <string>
-#include <cmath>
-#include <algorithm>
 #include "utils.hpp"
+#include "memory.hpp"
+#include "comm.hpp"
 
-// sort entries in an array in parallel
-void parallelSort(int size, int rank, MPI_Comm comm,
-      int N, void *vv, size_t sz,
-      int (*compare)(const void *, const void *),
-      void (*match)(void *, void *)
-      );
+namespace libp {
 
 // find a factorization n = nx*ny such that
 //  nx>=ny are 'close' to one another
-void factor2(const int n, int &nx, int &ny);
+void Factor2(const int n, int &nx, int &ny);
+
+void RankDecomp2(int  size_x, int  size_y,
+                 int &rank_x, int &rank_y,
+                 const int rank);
 
 // find a factorization n = nx*ny*nz such that
 //  nx>=ny>=nz are all 'close' to one another
-void factor3(const int n, int &nx, int &ny, int &nz);
-
-void matrixRightSolve(int NrowsA, int NcolsA, double *A, int NrowsB, int NcolsB, double *B, double *C);
-void matrixRightSolve(int NrowsA, int NcolsA, float *A, int NrowsB, int NcolsB, float *B, float *C);
-void matrixUnderdeterminedRightSolveMinNorm(int NrowsA, int NcolsA, dfloat *A, dfloat *b, dfloat *x);
-void matrixUnderdeterminedRightSolveCPQR(int NrowsA, int NcolsA, dfloat *A, dfloat *b, dfloat *x);
-
-void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI);
-void matrixEigenVectors(int N, float *A, float *VR, float *WR, float *WI);
-
-void matrixEigenValues(int N, double *A, double *WR, double *WI);
-void matrixEigenValues(int N, float *A, float *WR, float *WI);
-
-void matrixInverse(int N, double *A);
-void matrixInverse(int N, float *A);
-
-double matrixConditionNumber(int N, double *A);
-float  matrixConditionNumber(int N, float *A);
+void Factor3(const int n, int &nx, int &ny, int &nz);
 
-void matrixTranspose(const int M, const int N,
-                     const double  *A, const int LDA,
-                           double *AT, const int LDAT);
-void matrixTranspose(const int M, const int N,
-                     const float  *A, const int LDA,
-                           float *AT, const int LDAT);
+void RankDecomp3(int  size_x, int  size_y, int  size_z,
+                 int &rank_x, int &rank_y, int &rank_z,
+                 const int rank);
 
-void matrixTranspose(const int M, const int N,
-                     const int  *A, const int LDA,
-                           int *AT, const int LDAT);
-void matrixTranspose(const int M, const int N,
-                     const long long int  *A, const int LDA,
-                           long long int *AT, const int LDAT);
+} //namespace libp
 
 #endif
diff --git a/include/initialGuess.hpp b/include/initialGuess.hpp
index c30539018..5050ba541 100644
--- a/include/initialGuess.hpp
+++ b/include/initialGuess.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,146 +27,136 @@ SOFTWARE.
 #ifndef INITIALGUESS_HPP
 #define INITIALGUESS_HPP
 
-#include "linearSolver.hpp"
+#include "core.hpp"
+#include "platform.hpp"
+#include "solver.hpp"
+
+namespace libp {
+
+namespace InitialGuess {
+
+
+void AddSettings(settings_t& settings, const std::string prefix = "");
 
 // Abstract base class for different initial guess strategies.
 class initialGuessStrategy_t {
-protected:
-  platform_t& platform;
-  settings_t& settings;
-  MPI_Comm   comm;
+ protected:
+  platform_t platform;
+  settings_t settings;
+  comm_t   comm;
 
   dlong Ntotal;     // Degrees of freedom
 
-public:
-  initialGuessStrategy_t(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
-  virtual ~initialGuessStrategy_t();
+ public:
+  initialGuessStrategy_t(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm):
+    platform(_platform), settings(_settings), comm(_comm), Ntotal(_N) {}
 
-  virtual void FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs) = 0;
-  virtual void Update(solver_t& solver, occa::memory& o_x, occa::memory& o_rhs) = 0;
+  virtual void FormInitialGuess(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs) = 0;
+  virtual void Update(operator_t& linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs) = 0;
 };
 
 // Default initial guess strategy:  use whatever the user gave us.
-class igDefaultStrategy : public initialGuessStrategy_t {
+class Default : public initialGuessStrategy_t {
 public:
-  igDefaultStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
+  Default(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  void FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs);
-  void Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs);
+  void FormInitialGuess(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs);
+  void Update(operator_t &linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs);
 };
 
 // Zero initial guess strategy:  use a zero initial guess.
-class igZeroStrategy : public initialGuessStrategy_t {
+class Zero : public initialGuessStrategy_t {
 public:
-  igZeroStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
+  Zero(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  void FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs);
-  void Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs);
+  void FormInitialGuess(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs);
+  void Update(operator_t &linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs);
 };
 
 // Initial guess strategies based on RHS projection.
-class igProjectionStrategy : public initialGuessStrategy_t {
+class Projection : public initialGuessStrategy_t {
 protected:
   dlong curDim;           // Current dimension of the initial guess space
   dlong maxDim;           // Maximum dimension of the initial guess space
 
-  occa::memory o_btilde;  //  vector (e.g., to be added to space)
-  occa::memory o_xtilde;  // Solution vector corresponding to o_btilde
-  occa::memory o_Btilde;  //  space (orthogonalized)
-  occa::memory o_Xtilde;  // Solution space corresponding to  space
+  deviceMemory<dfloat> o_btilde;  //  vector (e.g., to be added to space)
+  deviceMemory<dfloat> o_xtilde;  // Solution vector corresponding to o_btilde
+  deviceMemory<dfloat> o_Btilde;  //  space (orthogonalized)
+  deviceMemory<dfloat> o_Xtilde;  // Solution space corresponding to  space
 
   // temporary buffer for basis inner product output
   dlong        ctmpNblocks;
-  dfloat       *ctmp;
-  occa::memory o_ctmp;
+  pinnedMemory<dfloat> ctmp;
+  deviceMemory<dfloat> o_ctmp;
 
-  dfloat *alphas;         // Buffers for storing inner products.
-  dfloat *alphasThisRank;
-  occa::memory o_alphas;
+  pinnedMemory<dfloat> alphas;    // Buffers for storing inner products.
+  deviceMemory<dfloat> o_alphas;
 
-  occa::kernel igBasisInnerProductsKernel;
-  occa::kernel igReconstructKernel;
-  occa::kernel igScaleKernel;
-  occa::kernel igUpdateKernel;
+  kernel_t igBasisInnerProductsKernel;
+  kernel_t igReconstructKernel;
+  kernel_t igScaleKernel;
+  kernel_t igUpdateKernel;
 
-  void igBasisInnerProducts(occa::memory& o_x, occa::memory& o_Q, occa::memory& o_c, dfloat *c, dfloat *cThisRank);
-  void igReconstruct(occa::memory& o_u, dfloat a, occa::memory& o_c, occa::memory& o_Q, occa::memory& o_unew);
+  void igBasisInnerProducts(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_c, pinnedMemory<dfloat>& c);
+  void igReconstruct(deviceMemory<dfloat>& o_u, dfloat a, deviceMemory<dfloat>& o_c, deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_unew);
 
 public:
-  igProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
-  virtual ~igProjectionStrategy();
+  Projection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  virtual void FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs);
-  virtual void Update(solver_t& solver, occa::memory& o_x, occa::memory& o_rhs) = 0;
+  virtual void FormInitialGuess(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs);
+  virtual void Update(operator_t& linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs) = 0;
 };
 
 // "Classic" initial guess strategy from Fischer's 1998 paper.
-class igClassicProjectionStrategy : public igProjectionStrategy {
+class ClassicProjection : public Projection {
 public:
-  igClassicProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
+  ClassicProjection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  void Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs);
+  void Update(operator_t &linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs);
 };
 
 // Rolling QR update for projection history space a la Christensen's thesis.
-class igRollingQRProjectionStrategy : public igProjectionStrategy {
+class RollingQRProjection : public Projection {
 private:
-  dfloat       *R;   // R factor in QR decomposition (row major)
-  occa::memory o_R;
+  pinnedMemory<dfloat>   R;   // R factor in QR decomposition (row major)
+  deviceMemory<dfloat> o_R;
 
-  occa::kernel igDropQRFirstColumnKernel;
+  kernel_t igDropQRFirstColumnKernel;
 
-	void givensRotation(dfloat a, dfloat b, dfloat *c, dfloat *s);
+	void givensRotation(dfloat a, dfloat b, dfloat& c, dfloat& s);
 
 public:
-  igRollingQRProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
-  ~igRollingQRProjectionStrategy();
+  RollingQRProjection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  void Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs);
+  void Update(operator_t &linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs);
 };
 
 // Extrapolation initial guess strategy.
-class igExtrapStrategy : public initialGuessStrategy_t {
+class Extrap : public initialGuessStrategy_t {
 private:
   int Nhistory;
   int shift;
   int entry;
-  occa::memory o_xh;
-  occa::memory o_coeffs;
-  occa::kernel igExtrapKernel;
-  occa::kernel igExtrapSparseKernel;
+  deviceMemory<dfloat> o_xh;
+  deviceMemory<dfloat> o_coeffs;
+  kernel_t igExtrapKernel;
+  kernel_t igExtrapSparseKernel;
 
   int Nsparse;
-  occa::memory o_sparseIds;
-  occa::memory o_sparseCoeffs;
+  deviceMemory<int> o_sparseIds;
+  deviceMemory<dfloat> o_sparseCoeffs;
 
-  void extrapCoeffs(int m, int M, dfloat *c);
+  void extrapCoeffs(int m, int M, memory<dfloat> c);
 
 public:
-  igExtrapStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
+  Extrap(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  void FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs);
-  void Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs);
+  void FormInitialGuess(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs);
+  void Update(operator_t &linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs);
 };
 
-// Linear solver with successive-RHS initial-guess generation.
-class initialGuessSolver_t : public linearSolver_t {
-protected:
-  initialGuessStrategy_t* igStrategy;   // The initial guess strategy.
-  linearSolver_t*         linearSolver; // The linearSolver_t that does the solve.
-
-public:
-  initialGuessSolver_t(dlong _N, dlong _Nhalo, platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
-  ~initialGuessSolver_t();
-
-  static initialGuessSolver_t* Setup(dlong _N, dlong _Nhalo,
-                                     platform_t& platform, settings_t& settings, MPI_Comm _comm);
-
-  int Solve(solver_t& solver, precon_t& precon,
-            occa::memory& o_x, occa::memory& o_rhs,
-            const dfloat tol, const int MAXIT, const int verbose);
-};
+} //namespace InitialGuess
 
-void initialGuessAddSettings(settings_t& settings, const string prefix = "");
+} //namespace libp
 
 #endif /* INITIALGUESS_HPP */
diff --git a/include/linAlg.hpp b/include/linAlg.hpp
index 93fede004..71ed4d3c8 100644
--- a/include/linAlg.hpp
+++ b/include/linAlg.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,126 +28,187 @@ SOFTWARE.
 #define LINALG_HPP
 
 #include "core.hpp"
+#include "memory.hpp"
 
-using std::vector;
-using std::string;
+namespace libp {
 
 class platform_t;
 
 //launcher for basic linear algebra OCCA kernels
 class linAlg_t {
-public:
-  platform_t *platform;
-  occa::properties kernelInfo;
-
-  int blocksize;
-
-  //scratch space for reductions
-  dfloat *scratch;
-  occa::memory h_scratch;
-  occa::memory o_scratch;
-
+ public:
   linAlg_t();
+  linAlg_t(platform_t *_platform) { Setup(_platform); }
 
   void Setup(platform_t *_platform);
 
   //initialize list of kernels
-  void InitKernels(vector<string> kernels);
-
-  ~linAlg_t();
+  void InitKernels(std::vector<std::string> kernels);
 
   /*********************/
   /* vector operations */
   /*********************/
 
   // o_a[n] = alpha
-  void set(const dlong N, const dfloat alpha, occa::memory& o_a);
+  void set(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_a);
 
   // o_a[n] *= alpha
-  void scale(const dlong N, const dfloat alpha, occa::memory& o_a);
+  void scale(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_a);
 
   // o_a[n] += alpha
-  void add(const dlong N, const dfloat alpha, occa::memory& o_a);
+  void add(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_a);
 
   // o_y[n] = beta*o_y[n] + alpha*o_x[n]
-  void axpy(const dlong N, const dfloat alpha, occa::memory& o_x,
-                           const dfloat beta,  occa::memory& o_y);
+  void axpy(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_x,
+                           const dfloat beta,  deviceMemory<dfloat> o_y);
 
   // o_z[n] = beta*o_y[n] + alpha*o_x[n]
-  void zaxpy(const dlong N, const dfloat alpha, occa::memory& o_x,
-                            const dfloat beta,  occa::memory& o_y,
-                            occa::memory& o_z);
+  void zaxpy(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_x,
+                            const dfloat beta,  deviceMemory<dfloat> o_y,
+                            deviceMemory<dfloat> o_z);
 
   // o_x[n] = alpha*o_a[n]*o_x[n]
   void amx(const dlong N, const dfloat alpha,
-           occa::memory& o_a, occa::memory& o_x);
+           deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x);
 
   // o_y[n] = alpha*o_a[n]*o_x[n] + beta*o_y[n]
   void amxpy(const dlong N, const dfloat alpha,
-             occa::memory& o_a, occa::memory& o_x,
-             const dfloat beta, occa::memory& o_y);
+             deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x,
+             const dfloat beta, deviceMemory<dfloat> o_y);
 
   // o_z[n] = alpha*o_a[n]*o_x[n] + beta*o_y[n]
   void zamxpy(const dlong N, const dfloat alpha,
-              occa::memory& o_a, occa::memory& o_x,
-              const dfloat beta, occa::memory& o_y, occa::memory& o_z);
+              deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x,
+              const dfloat beta, deviceMemory<dfloat> o_y, deviceMemory<dfloat> o_z);
 
   // o_x[n] = alpha*o_x[n]/o_a[n]
   void adx(const dlong N, const dfloat alpha,
-           occa::memory& o_a, occa::memory& o_x);
+           deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x);
 
   // o_y[n] = alpha*o_x[n]/o_a[n] + beta*o_y[n]
   void adxpy(const dlong N, const dfloat alpha,
-             occa::memory& o_a, occa::memory& o_x,
-             const dfloat beta, occa::memory& o_y);
+             deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x,
+             const dfloat beta, deviceMemory<dfloat> o_y);
 
   // o_z[n] = alpha*o_x[n]/o_a[n] + beta*o_y[n]
   void zadxpy(const dlong N, const dfloat alpha,
-              occa::memory& o_a, occa::memory& o_x,
-              const dfloat beta, occa::memory& o_y, occa::memory& o_z);
+              deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x,
+              const dfloat beta, deviceMemory<dfloat> o_y, deviceMemory<dfloat> o_z);
 
   // \min o_a
-  dfloat min(const dlong N, occa::memory& o_a, MPI_Comm comm);
+  dfloat min(const dlong N, deviceMemory<dfloat> o_a, comm_t comm);
 
   // \max o_a
-  dfloat max(const dlong N, occa::memory& o_a, MPI_Comm comm);
+  dfloat max(const dlong N, deviceMemory<dfloat> o_a, comm_t comm);
 
   // \sum o_a
-  dfloat sum(const dlong N, occa::memory& o_a, MPI_Comm comm);
+  dfloat sum(const dlong N, deviceMemory<dfloat> o_a, comm_t comm);
 
   // ||o_a||_2
-  dfloat norm2(const dlong N, occa::memory& o_a, MPI_Comm comm);
+  dfloat norm2(const dlong N, deviceMemory<dfloat> o_a, comm_t comm);
 
   // o_x.o_y
-  dfloat innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y,
-                    MPI_Comm comm);
+  dfloat innerProd(const dlong N, deviceMemory<dfloat> o_x, deviceMemory<dfloat> o_y,
+                    comm_t comm);
 
   // ||o_a||_w2
-  dfloat weightedNorm2(const dlong N, occa::memory& o_w, occa::memory& o_a,
-                       MPI_Comm comm);
+  dfloat weightedNorm2(const dlong N, deviceMemory<dfloat> o_w, deviceMemory<dfloat> o_a,
+                       comm_t comm);
 
   // o_w.o_x.o_y
-  dfloat weightedInnerProd(const dlong N, occa::memory& o_w, occa::memory& o_x,
-                            occa::memory& o_y, MPI_Comm comm);
-
-  occa::kernel setKernel;
-  occa::kernel addKernel;
-  occa::kernel scaleKernel;
-  occa::kernel axpyKernel;
-  occa::kernel zaxpyKernel;
-  occa::kernel amxKernel;
-  occa::kernel amxpyKernel;
-  occa::kernel zamxpyKernel;
-  occa::kernel adxKernel;
-  occa::kernel adxpyKernel;
-  occa::kernel zadxpyKernel;
-  occa::kernel minKernel;
-  occa::kernel maxKernel;
-  occa::kernel sumKernel;
-  occa::kernel norm2Kernel;
-  occa::kernel weightedNorm2Kernel;
-  occa::kernel innerProdKernel;
-  occa::kernel weightedInnerProdKernel;
+  dfloat weightedInnerProd(const dlong N, deviceMemory<dfloat> o_w, deviceMemory<dfloat> o_x,
+                            deviceMemory<dfloat> o_y, comm_t comm);
+
+  static void matrixRightSolve(const int NrowsA, const int NcolsA, const memory<double> A,
+                               const int NrowsB, const int NcolsB, const memory<double> B,
+                               memory<double> C);
+  static void matrixRightSolve(const int NrowsA, const int NcolsA, const memory<float> A,
+                               const int NrowsB, const int NcolsB, const memory<float> B,
+                               memory<float> C);
+  static void matrixUnderdeterminedRightSolveMinNorm(const int NrowsA, const int NcolsA,
+                                                     const memory<double> A,
+                                                     const memory<double> b,
+                                                     memory<double> x);
+  static void matrixUnderdeterminedRightSolveMinNorm(const int NrowsA, const int NcolsA,
+                                                     const memory<float> A,
+                                                     const memory<float> b,
+                                                     memory<float> x);
+  static void matrixUnderdeterminedRightSolveCPQR(const int NrowsA, const int NcolsA,
+                                                  const memory<double> A,
+                                                  const memory<double> b,
+                                                  memory<double> x);
+  static void matrixUnderdeterminedRightSolveCPQR(const int NrowsA, const int NcolsA,
+                                                  const memory<float> A,
+                                                  const memory<float> b,
+                                                  memory<float> x);
+
+  static void matrixEigenVectors(const int N, const memory<double> A,
+                                 memory<double> VR, memory<double> WR, memory<double> WI);
+  static void matrixEigenVectors(const int N, const memory<float> A,
+                                 memory<float> VR, memory<float> WR, memory<float> WI);
+
+  static void matrixEigenValues(const int N, const memory<double> A,
+                                memory<double> WR, memory<double> WI);
+  static void matrixEigenValues(const int N, const memory<float> A,
+                                memory<float> WR, memory<float> WI);
+
+  static void matrixInverse(const int N, memory<double> A);
+  static void matrixInverse(const int N, memory<float> A);
+
+  static double matrixConditionNumber(const int N, const memory<double> A);
+  static float  matrixConditionNumber(const int N, const memory<float> A);
+
+  static void matrixTranspose(const int M, const int N,
+                             const memory<double> A, const int LDA,
+                             memory<double> AT, const int LDAT);
+  static void matrixTranspose(const int M, const int N,
+                             const memory<float> A, const int LDA,
+                             memory<float> AT, const int LDAT);
+
+  static void matrixTranspose(const int M, const int N,
+                              const memory<int> A, const int LDA,
+                              memory<int> AT, const int LDAT);
+  static void matrixTranspose(const int M, const int N,
+                              const memory<long long int>  A, const int LDA,
+                              memory<long long int> AT, const int LDAT);
+
+ private:
+  platform_t *platform;
+  properties_t kernelInfo;
+
+  static constexpr int blocksize = 256;
+
+  //scratch space for reductions
+  deviceMemory<dfloat> o_scratch;
+  pinnedMemory<dfloat> h_scratch;
+
+  kernel_t setKernel;
+  kernel_t addKernel;
+  kernel_t scaleKernel;
+  kernel_t axpyKernel;
+  kernel_t zaxpyKernel;
+  kernel_t amxKernel;
+  kernel_t amxpyKernel;
+  kernel_t zamxpyKernel;
+  kernel_t adxKernel;
+  kernel_t adxpyKernel;
+  kernel_t zadxpyKernel;
+  kernel_t minKernel1;
+  kernel_t minKernel2;
+  kernel_t maxKernel1;
+  kernel_t maxKernel2;
+  kernel_t sumKernel1;
+  kernel_t sumKernel2;
+  kernel_t norm2Kernel1;
+  kernel_t norm2Kernel2;
+  kernel_t weightedNorm2Kernel1;
+  kernel_t weightedNorm2Kernel2;
+  kernel_t innerProdKernel1;
+  kernel_t innerProdKernel2;
+  kernel_t weightedInnerProdKernel1;
+  kernel_t weightedInnerProdKernel2;
 };
 
+} //namespace libp
+
 #endif
diff --git a/include/linearSolver.hpp b/include/linearSolver.hpp
index 30f431008..84e2f0a62 100644
--- a/include/linearSolver.hpp
+++ b/include/linearSolver.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -31,162 +31,190 @@ SOFTWARE.
 #include "platform.hpp"
 #include "solver.hpp"
 #include "precon.hpp"
+#include "initialGuess.hpp"
 
-//virtual base linear solver class
+namespace libp {
+
+namespace LinearSolver { class linearSolverBase_t; }
+
+/* General LinearSolver object*/
 class linearSolver_t {
+ public:
+  linearSolver_t() = default;
+
+  /*Generic setup. Create a Solver object from the perfectly-forwarded constructor arguments and wrap it in a shared_ptr*/
+  template<class Solver, class... Args>
+  void Setup(Args&& ... args) {
+    ls = std::make_shared<Solver>(static_cast<Args&&>(args)...); // same effect as std::forward<Args>(args)...
+
+    /*Make an initial guess strategy if we don't have one set up yet*/
+    if (ig==nullptr) {
+      MakeDefaultInitialGuessStrategy();
+    }
+  }
+
+  /*Generic setup. Create an InitialGuess object from the perfectly-forwarded constructor arguments and wrap it in a shared_ptr*/
+  template<class InitialGuess, class... Args>
+  void SetupInitialGuess(Args&& ... args) {
+    ig = std::make_shared<InitialGuess>(static_cast<Args&&>(args)...); // same effect as std::forward<Args>(args)...
+  }
+
+  int Solve(operator_t& linearOperator, operator_t& precon,
+            deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs,
+            const dfloat tol, const int MAXIT, const int verbose);
+
+ private:
+  std::shared_ptr<LinearSolver::linearSolverBase_t> ls=nullptr; // solver object created by Setup()
+  std::shared_ptr<InitialGuess::initialGuessStrategy_t> ig=nullptr; // strategy object created by SetupInitialGuess()
+
+  void MakeDefaultInitialGuessStrategy();
+
+  void assertInitialized();
+};
+
+
+namespace LinearSolver {
+
+//virtual base linear solver class
+class linearSolverBase_t {
 public:
-  platform_t& platform;
-  settings_t& settings;
-  MPI_Comm comm;
+  platform_t platform;
+  settings_t settings;
+  comm_t comm;
 
   dlong N;
   dlong Nhalo;
 
-  linearSolver_t(dlong _N, dlong _Nhalo,
-                 platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
+  linearSolverBase_t(dlong _N, dlong _Nhalo,
+                 platform_t& _platform, settings_t& _settings, comm_t _comm):
     platform(_platform), settings(_settings), comm(_comm),
     N(_N), Nhalo(_Nhalo) {}
 
-  static linearSolver_t* Setup(dlong _N, dlong _Nhalo,
-                               platform_t& platform, settings_t& settings, MPI_Comm _comm);
-
-  virtual int Solve(solver_t& solver, precon_t& precon,
-                    occa::memory& o_x, occa::memory& o_rhs,
+  virtual int Solve(operator_t& linearOperator, operator_t& precon,
+                    deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs,
                     const dfloat tol, const int MAXIT, const int verbose)=0;
-
-  virtual ~linearSolver_t(){}
 };
 
 //Preconditioned Conjugate Gradient
-class pcg: public linearSolver_t {
+class pcg: public linearSolverBase_t {
 private:
-  occa::memory o_p, o_Ap, o_z, o_Ax;
+  deviceMemory<dfloat> o_p, o_Ap, o_z, o_Ax;
 
-  dfloat* tmprdotr;
-  occa::memory h_tmprdotr;
-  occa::memory o_tmprdotr;
+  pinnedMemory<dfloat> rdotr;
+  deviceMemory<dfloat> o_rdotr;
 
   int flexible;
 
-  occa::kernel updatePCGKernel;
+  kernel_t updatePCGKernel;
 
-  dfloat UpdatePCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r);
+  dfloat UpdatePCG(const dfloat alpha, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_r);
 
 public:
   pcg(dlong _N, dlong _Nhalo,
-       platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
-  ~pcg();
+       platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  int Solve(solver_t& solver, precon_t& precon,
-            occa::memory& o_x, occa::memory& o_rhs,
+  int Solve(operator_t& linearOperator, operator_t& precon,
+            deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs,
             const dfloat tol, const int MAXIT, const int verbose);
 };
 
 //Preconditioned GMRES
-class pgmres: public linearSolver_t {
+class pgmres: public linearSolverBase_t {
 private:
-  occa::memory *o_V=nullptr;
-  occa::memory o_Ax, o_z, o_r;
+  deviceMemory<dfloat> o_Ax, o_z, o_r;
+  memory<deviceMemory<dfloat>> o_V;
 
   int restart;
 
-  dfloat *H=nullptr, *sn=nullptr, *cs=nullptr, *s=nullptr, *y=nullptr;
+  memory<dfloat> H, sn, cs, s, y;
 
-  void UpdateGMRES(occa::memory& o_x, const int I);
+  void UpdateGMRES(deviceMemory<dfloat>& o_x, const int I);
 
 public:
   pgmres(dlong _N, dlong _Nhalo,
-       platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
-  ~pgmres();
+       platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  int Solve(solver_t& solver, precon_t& precon,
-            occa::memory& o_x, occa::memory& o_rhs,
+  int Solve(operator_t& linearOperator, operator_t& precon,
+            deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs,
             const dfloat tol, const int MAXIT, const int verbose);
 };
 
 // Preconditioned MINRES
-class pminres : public linearSolver_t {
+class pminres : public linearSolverBase_t {
 private:
-  occa::memory o_p;
-  occa::memory o_z;
-  occa::memory o_r;
-  occa::memory o_r_old;
-  occa::memory o_q;
-  occa::memory o_q_old;
+  deviceMemory<dfloat> o_p;
+  deviceMemory<dfloat> o_z;
+  deviceMemory<dfloat> o_r;
+  deviceMemory<dfloat> o_r_old;
+  deviceMemory<dfloat> o_q;
+  deviceMemory<dfloat> o_q_old;
 
-  occa::kernel updateMINRESKernel;
+  kernel_t updateMINRESKernel;
 
-  dfloat innerProd(occa::memory& o_x, occa::memory& o_y);
+  dfloat innerProd(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_y);
   void UpdateMINRES(const dfloat ma2, const dfloat ma3, const dfloat alpha, const dfloat beta);
 
 public:
   pminres(dlong _N, dlong _Nhalo,
-         platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
-  ~pminres();
+         platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  int Solve(solver_t& solver, precon_t& precon,
-            occa::memory& o_x, occa::memory& o_rhs,
+  int Solve(operator_t& linearOperator, operator_t& precon,
+            deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs,
             const dfloat tol, const int MAXIT, const int verbose);
 };
 
 //Non-Blocking Preconditioned Conjugate Gradient
-class nbpcg: public linearSolver_t {
+class nbpcg: public linearSolverBase_t {
 private:
-  occa::memory o_p, o_s, o_S, o_z, o_Z, o_Ax;
+  deviceMemory<dfloat> o_p, o_s, o_S, o_z, o_Z, o_Ax;
 
-  dfloat* tmpdots;
-  occa::memory h_tmpdots;
-  occa::memory o_tmpdots;
+  pinnedMemory<dfloat> dots;
+  deviceMemory<dfloat> o_dots;
 
-  occa::kernel update1NBPCGKernel;
-  occa::kernel update2NBPCGKernel;
+  kernel_t update1NBPCGKernel;
+  kernel_t update2NBPCGKernel;
 
-  dfloat *localdots, *globaldots;
-
-  MPI_Request request;
-  MPI_Status  status;
+  Comm::request_t request;
 
   void Update1NBPCG(const dfloat beta);
-  void Update2NBPCG(const dfloat alpha, occa::memory &o_r);
+  void Update2NBPCG(const dfloat alpha, deviceMemory<dfloat>& o_r);
 
 public:
   nbpcg(dlong _N, dlong _Nhalo,
-       platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
-  ~nbpcg();
+       platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  int Solve(solver_t& solver, precon_t& precon,
-            occa::memory& o_x, occa::memory& o_rhs,
+  int Solve(operator_t& linearOperator, operator_t& precon,
+            deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs,
             const dfloat tol, const int MAXIT, const int verbose);
 };
 
 //Non-Blocking Flexible Preconditioned Conjugate Gradient
-class nbfpcg: public linearSolver_t {
+class nbfpcg: public linearSolverBase_t {
 private:
-  occa::memory o_u, o_p, o_w, o_n, o_m, o_s, o_z, o_q, o_Ax;
-
-  dfloat* tmpdots;
-  occa::memory h_tmpdots;
-  occa::memory o_tmpdots;
+  deviceMemory<dfloat> o_u, o_p, o_w, o_n, o_m, o_s, o_z, o_q, o_Ax;
 
-  occa::kernel update0NBFPCGKernel;
-  occa::kernel update1NBFPCGKernel;
+  pinnedMemory<dfloat> dots;
+  deviceMemory<dfloat> o_dots;
 
-  dfloat *localdots, *globaldots;
+  kernel_t update0NBFPCGKernel;
+  kernel_t update1NBFPCGKernel;
 
-  MPI_Request request;
-  MPI_Status  status;
+  Comm::request_t request;
 
-  void Update0NBFPCG(occa::memory &o_r);
-  void Update1NBFPCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r);
+  void Update0NBFPCG(deviceMemory<dfloat>& o_r);
+  void Update1NBFPCG(const dfloat alpha, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_r);
 
 public:
   nbfpcg(dlong _N, dlong _Nhalo,
-       platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
-  ~nbfpcg();
+       platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  int Solve(solver_t& solver, precon_t& precon,
-            occa::memory& o_x, occa::memory& o_rhs,
+  int Solve(operator_t& linearOperator, operator_t& precon,
+            deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs,
             const dfloat tol, const int MAXIT, const int verbose);
 };
 
+} //namespace LinearSolver
+
+} //namespace libp
+
 #endif
diff --git a/include/memory.hpp b/include/memory.hpp
new file mode 100644
index 000000000..7232c897e
--- /dev/null
+++ b/include/memory.hpp
@@ -0,0 +1,778 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef LIBP_MEMORY_HPP
+#define LIBP_MEMORY_HPP
+
+#include "utils.hpp"
+
+namespace libp {
+
+template<typename T>
+class memory {
+  template <typename U> friend class memory;
+
+ private:
+  using size_t = std::size_t;
+  using ptrdiff_t = std::ptrdiff_t;
+
+  std::shared_ptr<T[]> shrdPtr;
+  size_t lngth;
+  size_t offset;
+
+ public:
+  memory() :
+    lngth{0},
+    offset{0} {}
+
+  memory(const size_t lngth_) :
+    shrdPtr(new T[lngth_]),
+    lngth{lngth_},
+    offset{0} {}
+
+  memory(const size_t lngth_,
+         const T val) :
+    shrdPtr(new T[lngth_]),
+    lngth{lngth_},
+    offset{0} {
+    #pragma omp parallel for
+    for (size_t i=0;i<lngth;++i) {
+      shrdPtr[i] = val;
+    }
+  }
+
+  /*Conversion constructor: view the buffer of a memory<U> as entries of type T, sharing ownership (no copy)*/
+  template<typename U>
+  memory(const memory<U> &m):
+    shrdPtr{std::reinterpret_pointer_cast<T[]>(m.shrdPtr)},
+    lngth{m.lngth*sizeof(U)/sizeof(T)},   // m spans m.lngth*sizeof(U) bytes, i.e. that many bytes / sizeof(T) entries of T
+    offset{m.offset*sizeof(U)/sizeof(T)} { // same byte-preserving rescale for the view offset
+    // Check that this conversion made sense: the byte length and byte offset must both be whole multiples of sizeof(T)
+    LIBP_ABORT("libp::memory type conversion failed. Trying to convert "
+                << m.lngth << " " << sizeof(U) << "-byte words to "
+                << lngth << " " << sizeof(T) << "-byte words.",
+                lngth*sizeof(T) != m.lngth*sizeof(U));
+
+    LIBP_ABORT("libp::memory type conversion failed. Source memory has offset at "
+               << m.offset << " " << sizeof(U) << "-byte words, destination memory would have offset at "
+               << offset << " " << sizeof(T) << "-byte words.",
+               offset*sizeof(T) != m.offset*sizeof(U));
+  }
+
+  memory(const memory<T> &m)=default;
+  memory& operator = (const memory<T> &m)=default;
+  ~memory()=default;
+
+  void malloc(const size_t lngth_) {
+    *this = memory<T>(lngth_);
+  }
+
+  void malloc(const size_t lngth_, const T val) {
+    *this = memory<T>(lngth_, val);
+  }
+
+  void calloc(const size_t lngth_) {
+    *this = memory<T>(lngth_, T{0});
+  }
+
+  void realloc(const size_t lngth_) { // resize to lngth_ entries by allocating a fresh buffer
+    memory<T> m(lngth_);
+    const ptrdiff_t cnt = std::min(lngth, lngth_); // entries carried over: the shorter of the old and new lengths
+    m.copyFrom(*this, cnt);
+    *this = m; // rebinds only this handle; other handles sharing the old buffer keep referencing it
+  }
+
+  memory& swap(memory<T> &m) {
+    std::swap(shrdPtr, m.shrdPtr);
+    std::swap(lngth, m.lngth);
+    std::swap(offset, m.offset);
+    return *this;
+  }
+
+  T* ptr() {
+    return shrdPtr.get()+offset;
+  }
+  const T* ptr() const {
+    return shrdPtr.get()+offset;
+  }
+
+  T* begin() {return ptr();}
+  T* end() {return ptr() + length();}
+
+  size_t length() const {
+    return lngth;
+  }
+
+  size_t size() const {
+    return lngth*sizeof(T);
+  }
+
+  size_t use_count() const {
+    return shrdPtr.use_count();
+  }
+
+  T& operator[](const ptrdiff_t idx) const {
+    return shrdPtr[idx+offset];
+  }
+
+  bool operator == (const memory<T> &other) const {
+    return (shrdPtr==other.shrdPtr && offset==other.offset);
+  }
+  bool operator != (const memory<T> &other) const {
+    return (shrdPtr!=other.shrdPtr || offset!=other.offset);
+  }
+
+  memory<T> operator + (const ptrdiff_t offset_) const {
+    return slice(offset_);
+  }
+  memory<T>& operator += (const ptrdiff_t offset_) {
+    *this = slice(offset_);
+    return *this;
+  }
+
+  memory<T> slice(const ptrdiff_t offset_,            // entries to skip, relative to the start of this view
+                  const ptrdiff_t count = -1) const { // entries in the new view; -1 keeps everything after offset_
+    memory<T> m(*this); // copies the handle, so the slice shares the same allocation
+    m.offset = offset + offset_;
+    m.lngth = (count==-1)
+                ? (lngth - offset_)
+                : count;
+    return m; // NOTE(review): no range check is done here; offset_ > lngth or an oversized count goes undetected
+  }
+
+  /*Copy from raw ptr: read cnt entries starting at src and write them into this memory starting at entry offset_*/
+  void copyFrom(const T* src,
+                const ptrdiff_t count = -1,    // entries to copy; -1 selects lngth
+                const ptrdiff_t offset_ = 0) { // first entry of this memory that is written
+
+    const ptrdiff_t cnt = (count==-1) ? lngth : count; // with count==-1 and offset_>0 the range check below aborts
+
+    LIBP_ABORT("libp::memory::copyFrom Cannot have negative count ("
+               << cnt << ")",
+               cnt < 0);
+    LIBP_ABORT("libp::memory::copyFrom Cannot have negative offset ("
+               << offset_ << ")",
+               offset_ < 0);
+    LIBP_ABORT("libp::memory::copyFrom Destination memory has size [" << lngth << "],"
+               << " trying to access [" << offset_ << ", " << offset_+static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt)+offset_ > lngth);
+
+    std::copy(src,          // the extent of src cannot be checked here: it is a raw pointer
+              src+cnt,
+              ptr()+offset_);
+  }
+
+  /*Copy from memory: read the first cnt entries of src and write them into this memory starting at entry offset_*/
+  void copyFrom(const memory<T> src,           // taken by value: a handle copy that shares src's buffer
+                const ptrdiff_t count = -1,    // entries to copy; -1 selects this memory's lngth (not src's)
+                const ptrdiff_t offset_ = 0) { // first entry of this memory that is written
+    const ptrdiff_t cnt = (count==-1) ? lngth : count;
+
+    LIBP_ABORT("libp::memory::copyFrom Cannot have negative count ("
+               << cnt << ")",
+               cnt < 0);
+    LIBP_ABORT("libp::memory::copyFrom Cannot have negative offset ("
+               << offset_ << ")",
+               offset_ < 0);
+    LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "],"
+               << " trying to access [0, " << static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt) > src.length());
+    LIBP_ABORT("libp::memory::copyFrom Destination memory has size [" << lngth << "],"
+               << " trying to access [" << offset_ << ", " << offset_+static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt)+offset_ > lngth);
+
+    std::copy(src.ptr(),     // reading always starts at the beginning of src's view
+              src.ptr()+cnt,
+              ptr()+offset_);
+  }
+
+  /*Copy to raw pointer: read cnt entries of this memory starting at entry offset_ and write them starting at dest*/
+  void copyTo(T *dest,
+              const ptrdiff_t count = -1,          // entries to copy; -1 selects lngth
+              const ptrdiff_t offset_ = 0) const { // first entry of this memory that is read
+    const ptrdiff_t cnt = (count==-1) ? lngth : count; // with count==-1 and offset_>0 the range check below aborts
+
+    LIBP_ABORT("libp::memory::copyTo Cannot have negative count ("
+               << cnt << ")",
+               cnt < 0);
+    LIBP_ABORT("libp::memory::copyTo Cannot have negative offset ("
+               << offset_ << ")",
+               offset_ < 0);
+    LIBP_ABORT("libp::memory::copyTo Source memory has size [" << lngth << "],"
+               << " trying to access [" << offset_ << ", " << offset_+static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt)+offset_ > lngth);
+
+    std::copy(ptr()+offset_,
+              ptr()+offset_+cnt,
+              dest);          // the extent of dest cannot be checked here: it is a raw pointer
+  }
+
+  /*Copy to memory: read cnt entries of this memory starting at entry offset_ and write them at the start of dest*/
+  void copyTo(memory<T> dest,                      // taken by value: a handle copy that shares dest's buffer
+              const ptrdiff_t count = -1,          // entries to copy; -1 selects this memory's lngth (not dest's)
+              const ptrdiff_t offset_ = 0) const { // first entry of this memory that is read
+    const ptrdiff_t cnt = (count==-1) ? lngth : count;
+
+    LIBP_ABORT("libp::memory::copyTo Cannot have negative count ("
+               << cnt << ")",
+               cnt < 0);
+    LIBP_ABORT("libp::memory::copyTo Cannot have negative offset ("
+               << offset_ << ")",
+               offset_ < 0);
+    LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "],"
+               << " trying to access [0, " << cnt << "]",
+               static_cast<size_t>(cnt) > dest.length());
+    LIBP_ABORT("libp::memory::copyTo Source memory has size [" << lngth << "],"
+               << " trying to access [" << offset_ << ", " << offset_+static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt)+offset_ > lngth);
+
+    std::copy(ptr()+offset_,
+              ptr()+offset_+cnt,
+              dest.ptr());    // writing always starts at the beginning of dest's view
+  }
+
+  memory<T> clone() const {
+    memory<T> m(lngth);
+    m.copyFrom(*this);
+    return m;
+  }
+
+  void free() {
+    shrdPtr = nullptr;
+    lngth=0;
+    offset=0;
+  }
+};
+
+template <typename T>
+std::ostream& operator << (std::ostream &out,
+                         const memory<T> &memory) {
+  out << "memory - "
+      << "type: " << typeid(T).name() << ", "
+      << "ptr : " << memory.ptr() << ", "
+      << "length : " << memory.length() << ", "
+      << "use_count : " << memory.use_count();
+  return out;
+}
+
+/*Extern declare common instantiations for faster compilation*/
+extern template class memory<int>;
+extern template class memory<long long int>;
+extern template class memory<float>;
+extern template class memory<double>;
+
+/*libp::deviceMemory is a wrapper around occa::memory*/
+template<typename T>
+class deviceMemory: public occa::memory {
+ public:
+  deviceMemory() = default;
+  deviceMemory(const deviceMemory<T> &m)=default;
+  deviceMemory(occa::memory m):
+    occa::memory(m)
+  {
+    if (isInitialized()) {
+      if (occa::dtype::get<T>() == occa::dtype::none) {
+        occa::memory::setDtype(occa::dtype::byte);
+      } else {
+        occa::memory::setDtype(occa::dtype::get<T>());
+      }
+    }
+  }
+
+  /*Conversion constructor*/
+  template<typename U>
+  deviceMemory(const deviceMemory<U> &m):
+    occa::memory(m)
+  {
+    if (isInitialized()) {
+      if (occa::dtype::get<T>() == occa::dtype::none) {
+        occa::memory::setDtype(occa::dtype::byte);
+      } else {
+        occa::memory::setDtype(occa::dtype::get<T>());
+      }
+    }
+  }
+
+  deviceMemory<T>& operator = (const deviceMemory<T> &m)=default;
+  ~deviceMemory()=default;
+
+  T* ptr() {
+    return static_cast<T*>(occa::memory::ptr());
+  }
+  const T* ptr() const {
+    return static_cast<const T*>(occa::memory::ptr());
+  }
+
+  size_t length() const {
+    return size()/sizeof(T);
+  }
+
+  T& operator[](const ptrdiff_t idx) {
+    return ptr()[idx];
+  }
+
+  deviceMemory<T> operator + (const ptrdiff_t offset) const {
+    if (isInitialized())
+      return deviceMemory<T>(occa::memory::operator+(offset));
+    else
+      return deviceMemory<T>();
+  }
+
+  deviceMemory<T>& operator += (const ptrdiff_t offset) {
+    *this = deviceMemory<T>(occa::memory::slice(offset));
+    return *this;
+  }
+
+  /*Copy from libp::memory*/
+  void copyFrom(const libp::memory<T> src,
+                const ptrdiff_t count = -1,
+                const ptrdiff_t offset = 0,
+                const properties_t &props = properties_t()) {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "],"
+               << " trying to access [0, " << static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt) > src.length());
+
+    occa::memory::copyFrom(src.ptr(),
+                           cnt*sizeof(T),
+                           offset*sizeof(T),
+                           props);
+  }
+
+  void copyFrom(const libp::memory<T> src,
+                const properties_t &props) {
+
+    if (length()==0) return;
+
+    LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "],"
+               << " trying to access [0, " << length() << "]",
+               length() > src.length());
+
+    occa::memory::copyFrom(src.ptr(),
+                           length()*sizeof(T),
+                           0,
+                           props);
+  }
+
+  /*Copy from libp::deviceMemory*/
+  void copyFrom(const deviceMemory<T> src,
+                const ptrdiff_t count = -1,
+                const ptrdiff_t offset = 0,
+                const properties_t &props = properties_t()) {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyFrom(src,
+                           cnt*sizeof(T),
+                           offset*sizeof(T),
+                           0,
+                           props);
+  }
+
+  void copyFrom(const deviceMemory<T> src,
+                const properties_t &props) {
+
+    if (length()==0) return;
+
+    occa::memory::copyFrom(src,
+                           length()*sizeof(T),
+                           0,
+                           0,
+                           props);
+  }
+
+  /*Copy to libp::memory*/
+  void copyTo(libp::memory<T> dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset = 0,
+              const properties_t &props = properties_t()) const {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "],"
+               << " trying to access [0, " << static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt) > dest.length());
+
+    occa::memory::copyTo(dest.ptr(),
+                         cnt*sizeof(T),
+                         offset*sizeof(T),
+                         props);
+  }
+
+  void copyTo(libp::memory<T> dest,
+              const properties_t &props) const {
+
+    if (length()==0) return;
+
+    LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "],"
+               << " trying to access [0, " << length() << "]",
+               length() > dest.length());
+
+    occa::memory::copyTo(dest.ptr(),
+                         length()*sizeof(T),
+                         0,
+                         props);
+  }
+
+  /*Copy to libp::deviceMemory*/
+  void copyTo(deviceMemory<T> dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset = 0,
+              const properties_t &props = properties_t()) const {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyTo(dest,
+                         cnt*sizeof(T),
+                         0,
+                         offset*sizeof(T),
+                         props);
+  }
+
+  void copyTo(deviceMemory<T> dest,
+              const properties_t &props) const {
+
+    if (length()==0) return;
+
+    occa::memory::copyTo(dest,
+                         length()*sizeof(T),
+                         0,
+                         0,
+                         props);
+  }
+};
+
+/*Extern declare common instantiations for faster compilation*/
+extern template class deviceMemory<int>;
+extern template class deviceMemory<long long int>;
+extern template class deviceMemory<float>;
+extern template class deviceMemory<double>;
+
+/*libp::pinnedMemory is another wrapper around occa::memory,
+  but is allocated slightly differently*/
+template<typename T>
+class pinnedMemory: public occa::memory {
+ public:
+  pinnedMemory() = default;
+  pinnedMemory(const pinnedMemory<T> &m)=default;
+  /*Wrap an occa::memory; when it is initialized, tag it with T's occa dtype (falling back to byte if T has none)*/
+  pinnedMemory(occa::memory m):
+    occa::memory(m) {
+    if (isInitialized()) {
+      if (occa::dtype::get<T>() == occa::dtype::none) {
+        occa::memory::setDtype(occa::dtype::byte);
+      } else {
+        occa::memory::setDtype(occa::dtype::get<T>());
+      }
+    }
+  }
+
+  /*Conversion constructor*/
+  template<typename U>
+  pinnedMemory(const pinnedMemory<U> &m):
+    occa::memory(m)
+  {
+    if (isInitialized()) {
+      if (occa::dtype::get<T>() == occa::dtype::none) {
+        occa::memory::setDtype(occa::dtype::byte);
+      } else {
+        occa::memory::setDtype(occa::dtype::get<T>());
+      }
+    }
+  }
+
+  pinnedMemory<T>& operator = (const pinnedMemory<T> &m)=default;
+  ~pinnedMemory()=default;
+
+  T* ptr() {
+    return static_cast<T*>(occa::memory::ptr());
+  }
+  const T* ptr() const {
+    return static_cast<const T*>(occa::memory::ptr());
+  }
+
+  size_t length() const {
+    return size()/sizeof(T);
+  }
+
+  T& operator[](const ptrdiff_t idx) {
+    return ptr()[idx];
+  }
+
+  pinnedMemory<T> operator + (const ptrdiff_t offset) const {
+    if (isInitialized())
+      return pinnedMemory<T>(occa::memory::operator+(offset));
+    else
+      return pinnedMemory<T>();
+  }
+
+  pinnedMemory<T>& operator += (const ptrdiff_t offset) {
+    *this = pinnedMemory<T>(occa::memory::slice(offset));
+    return *this;
+  }
+
+  /*Copy from raw pointer*/
+  void copyFrom(const T* src,
+                const ptrdiff_t count = -1,
+                const ptrdiff_t offset = 0,
+                const properties_t &props = properties_t()) {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyFrom(src,
+                           cnt*sizeof(T),
+                           offset*sizeof(T),
+                           props);
+  }
+
+  void copyFrom(const T* src,
+                const properties_t &props) {
+
+    if (length()==0) return;
+
+    occa::memory::copyFrom(src,
+                           length()*sizeof(T),
+                           0,
+                           props);
+  }
+
+  /*Copy from libp::memory*/
+  void copyFrom(const libp::memory<T> src,
+                const ptrdiff_t count = -1,
+                const ptrdiff_t offset = 0,
+                const properties_t &props = properties_t()) {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "],"
+               << " trying to access [0, " << static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt) > src.length());
+
+    occa::memory::copyFrom(src.ptr(),
+                           cnt*sizeof(T),
+                           offset*sizeof(T),
+                           props);
+  }
+
+  void copyFrom(const libp::memory<T> src,
+                const properties_t &props) {
+
+    if (length()==0) return;
+
+    LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "],"
+               << " trying to access [0, " << length() << "]",
+               length() > src.length());
+
+    occa::memory::copyFrom(src.ptr(),
+                           length()*sizeof(T),
+                           0,
+                           props);
+  }
+
+  /*Copy from libp::deviceMemory*/
+  void copyFrom(const deviceMemory<T> src,
+                const ptrdiff_t count = -1,
+                const ptrdiff_t offset = 0,
+                const properties_t &props = properties_t()) {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyFrom(src,
+                           cnt*sizeof(T),
+                           offset*sizeof(T),
+                           0,
+                           props);
+  }
+
+  void copyFrom(const deviceMemory<T> src,
+                const properties_t &props) {
+
+    if (length()==0) return;
+
+    occa::memory::copyFrom(src,
+                           length()*sizeof(T),
+                           0,
+                           0,
+                           props);
+  }
+
+  /*Copy from libp::pinnedMemory*/
+  void copyFrom(const pinnedMemory<T> src,
+                const ptrdiff_t count = -1,
+                const ptrdiff_t offset = 0,
+                const properties_t &props = properties_t()) {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyFrom(src,
+                           cnt*sizeof(T),
+                           offset*sizeof(T),
+                           0,
+                           props);
+  }
+
+  void copyFrom(const pinnedMemory<T> src,
+                const properties_t &props) {
+
+    if (length()==0) return;
+
+    occa::memory::copyFrom(src,
+                           length()*sizeof(T),
+                           0,
+                           0,
+                           props);
+  }
+
+  /*Copy to raw pointer*/
+  void copyTo(T* dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset = 0,
+              const properties_t &props = properties_t()) const {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyTo(dest,
+                         cnt*sizeof(T),
+                         offset*sizeof(T),
+                         props);
+  }
+
+  void copyTo(T* dest,
+              const properties_t &props) const {
+
+    if (length()==0) return;
+
+    occa::memory::copyTo(dest,
+                         length()*sizeof(T),
+                         0,
+                         props);
+  }
+
+  /*Copy to libp::memory*/
+  void copyTo(libp::memory<T> dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset = 0,
+              const properties_t &props = properties_t()) const {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "],"
+               << " trying to access [0, " << static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt) > dest.length());
+
+    occa::memory::copyTo(dest.ptr(),
+                         cnt*sizeof(T),
+                         offset*sizeof(T),
+                         props);
+  }
+
+  void copyTo(libp::memory<T> dest,
+              const properties_t &props) const {
+
+    if (length()==0) return;
+
+    LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "],"
+               << " trying to access [0, " << length() << "]",
+               length() > dest.length());
+
+    occa::memory::copyTo(dest.ptr(),
+                         length()*sizeof(T),
+                         0,
+                         props);
+  }
+
+  /*Copy to libp::deviceMemory*/
+  void copyTo(deviceMemory<T> dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset = 0,
+              const properties_t &props = properties_t()) const {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyTo(dest,
+                         cnt*sizeof(T),
+                         0,
+                         offset*sizeof(T),
+                         props);
+  }
+
+  void copyTo(deviceMemory<T> dest,
+              const properties_t &props) const {
+
+    if (length()==0) return;
+
+    occa::memory::copyTo(dest,
+                         length()*sizeof(T),
+                         0,
+                         0,
+                         props);
+  }
+
+  /*Copy to libp::pinnedMemory*/
+  void copyTo(pinnedMemory<T> dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset = 0,
+              const properties_t &props = properties_t()) const {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyTo(dest,
+                         cnt*sizeof(T),
+                         0,
+                         offset*sizeof(T),
+                         props);
+  }
+
+  void copyTo(pinnedMemory<T> dest,
+              const properties_t &props) const {
+
+    if (length()==0) return;
+
+    occa::memory::copyTo(dest,
+                         length()*sizeof(T),
+                         0,
+                         0,
+                         props);
+  }
+};
+
+} //namespace libp
+
+#endif
diff --git a/include/mesh.hpp b/include/mesh.hpp
index 4e58b790b..8b1d18ce8 100644
--- a/include/mesh.hpp
+++ b/include/mesh.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,494 +28,1076 @@ SOFTWARE.
 #define MESH_HPP 1
 
 #include "core.hpp"
+#include "platform.hpp"
 #include "settings.hpp"
 #include "ogs.hpp"
 
-#define TRIANGLES 3
-#define QUADRILATERALS 4
-#define TETRAHEDRA 6
-#define HEXAHEDRA 12
+namespace libp {
 
 class meshSettings_t: public settings_t {
 public:
-  meshSettings_t(MPI_Comm& _comm);
+  meshSettings_t() = default;
+  meshSettings_t(comm_t _comm);
   void report();
 };
 
-class mesh_t {
-public:
-  platform_t& platform;
-  meshSettings_t& settings;
+namespace Mesh {
+  /*Element types*/
+  enum ElementType {
+    TRIANGLES     =3,
+    QUADRILATERALS=4,
+    TETRAHEDRA    =6,
+    HEXAHEDRA     =12
+  };
+} //namespace Mesh
 
-  occa::properties props;
+class mesh_t {
+ public:
+  platform_t platform;
+  meshSettings_t settings;
+  properties_t props;
 
-  MPI_Comm comm;
+  comm_t comm;
   int rank, size;
 
+  /*************************/
+  /* Element Data          */
+  /*************************/
   int dim;
   int Nverts, Nfaces, NfaceVertices;
+  Mesh::ElementType elementType;
 
   // indices of vertex nodes
-  int *vertexNodes;
-
-  int elementType;
+  memory<int> vertexNodes;
 
   hlong Nnodes=0; //global number of element vertices
-  dfloat *EX; // coordinates of vertices for each element
-  dfloat *EY;
-  dfloat *EZ;
+  memory<dfloat> EX; // coordinates of vertices for each element
+  memory<dfloat> EY;
+  memory<dfloat> EZ;
 
   dlong Nelements=0;       //local element count
   hlong NelementsGlobal=0; //global element count
-  hlong *EToV; // element-to-vertex connectivity
-  dlong *EToE; // element-to-element connectivity
-  int   *EToF; // element-to-(local)face connectivity
-  int   *EToP; // element-to-partition/process connectivity
-  int   *EToB; // element-to-boundary condition type
+  memory<hlong> EToV;      // element-to-vertex connectivity
+  memory<hlong> EToE;      // element-to-element connectivity
+  memory<int>   EToF;      // element-to-(local)face connectivity
+  memory<int>   EToP;      // element-to-partition/process connectivity
+  memory<int>   EToB;      // element-to-boundary condition type
+  deviceMemory<int> o_EToB;
+
+  memory<int>   mapB;      // node-to-boundary condition type
+  deviceMemory<int> o_mapB;
+
+  memory<hlong> elementInfo; //type of element
 
-  hlong *elementInfo; //type of element
+  memory<dlong> VmapM;  // list of vertices on each face
+  memory<dlong> VmapP;  // list of vertices that are paired with face vertices
 
   // boundary faces
   hlong NboundaryFaces=0; // number of boundary faces
-  hlong *boundaryInfo; // list of boundary faces (type, vertex-1, vertex-2, vertex-3)
+  memory<hlong> boundaryInfo; // list of boundary faces (type, vertex-1, vertex-2, vertex-3)
+
+  /*************************/
+  /* FEM Space             */
+  /*************************/
+  int N=0, Np=0;             // N = Polynomial order and Np = Nodes per element
+  memory<dfloat> r, s, t;    // coordinates of local nodes
+
+  int Nq=0;                 // N = Polynomial order, Nq=N+1
+  memory<dfloat> gllz;      // 1D GLL quadrature nodes
+  memory<dfloat> gllw;      // 1D GLL quadrature weights
+
+  // face node info
+  int Nfp=0;                // number of nodes per face
+  memory<int> faceNodes;    // list of element reference interpolation nodes on element faces
+  memory<int> faceVertices; // list of mesh vertices on each face
+
+  /*************************/
+  /* FEM Operators         */
+  /*************************/
+  memory<dfloat> Dr, Ds, Dt;    // collocation differentiation matrices
+  memory<dfloat> D;
+  deviceMemory<dfloat> o_D;
+  memory<dfloat> MM, invMM;     // reference mass matrix
+  deviceMemory<dfloat> o_MM;
+  memory<dfloat> LIFT;          // lift matrix
+  deviceMemory<dfloat> o_LIFT;
+  memory<dfloat> sM;            // surface mass (MM*LIFT)^T
+  deviceMemory<dfloat> o_sM;
+  memory<dfloat> Srr, Srs, Srt; //element stiffness matrices
+  memory<dfloat> Ssr, Sss, Sst;
+  memory<dfloat> Str, Sts, Stt;
+  memory<dfloat> S;
+  deviceMemory<dfloat> o_S;
+
+  /*************************/
+  /* Cubature              */
+  /*************************/
+  // cubature
+  int cubN=0, cubNp=0, cubNfp=0, cubNq=0;
+  memory<dfloat> cubr, cubs, cubt, cubw; // coordinates and weights of local cubature nodes
+
+  memory<dfloat> cubInterp;    // interpolate from W&B to cubature nodes
+  deviceMemory<dfloat> o_cubInterp;
+  memory<dfloat> cubProject;   // projection matrix from cubature nodes to W&B nodes
+  deviceMemory<dfloat> o_cubProject;
+  memory<dfloat> cubD;         // 1D differentiation matrix
+  deviceMemory<dfloat> o_cubD;
+  memory<dfloat> cubPDrT, cubPDsT, cubPDtT;  // weak differentiation matrices
+  memory<dfloat> cubPDT;                     // packed weak differentiation matrices
+  deviceMemory<dfloat> o_cubPDT;
 
+  // surface integration node info
+  int intNfp=0;    // number of integration nodes on each face
+  memory<dfloat> intr, ints, intw;
+  memory<dfloat> intInterp; // interp from surface node to integration nodes
+  deviceMemory<dfloat> o_intInterp;
+  memory<dfloat> intLIFT;   // lift from surface integration nodes to W&B volume nodes
+  deviceMemory<dfloat> o_intLIFT;
+
+  /*************************/
+  /* Plotting              */
+  /*************************/
+  // ploting info for generating field vtu
+  int plotN=0;
+  int plotNq=0;
+  int plotNp=0;
+  int plotNverts;    // number of vertices for each plot element
+  int plotNelements; // number of "plot elements" per element
+  memory<int>   plotEToV;             // triangulation of plot nodes
+  memory<dfloat> plotR, plotS, plotT; // coordinates of plot nodes in reference element
+  memory<dfloat> plotInterp;          // reference to plot node interpolation matrix
+
+  /*************************/
+  /* Physical Space        */
+  /*************************/
+  // volume node info
+  memory<dfloat> x, y, z;    // coordinates of physical nodes
+  deviceMemory<dfloat> o_x, o_y, o_z;    // coordinates of physical nodes
+
+  memory<dlong> vmapM;      // list of volume nodes that are face nodes
+  deviceMemory<dlong> o_vmapM;
+  memory<dlong> vmapP;      // list of volume nodes that are paired with face nodes
+  deviceMemory<dlong> o_vmapP;
+  memory<dlong> mapP;       // list of surface nodes that are paired with -ve surface  nodes
+  deviceMemory<dlong> o_mapP;
+
+  // Jacobian
+  memory<dfloat> wJ;
+  deviceMemory<dfloat> o_wJ;
+  // volumeGeometricFactors;
+  dlong Nvgeo;
+  memory<dfloat> vgeo;
+  deviceMemory<dfloat> o_vgeo;
+  // surfaceGeometricFactors;
+  dlong   Nsgeo;
+  memory<dfloat> sgeo;
+  deviceMemory<dfloat> o_sgeo;
+  // second order volume geometric factors
+  dlong Nggeo;
+  memory<dfloat> ggeo;
+  deviceMemory<dfloat> o_ggeo;
+
+  memory<dfloat> cubx, cuby, cubz; // coordinates of physical nodes
+  deviceMemory<dfloat> o_cubx, o_cuby, o_cubz;
+  memory<dfloat> intx, inty, intz; // coordinates of suface integration nodes
+  deviceMemory<dfloat> o_intx, o_inty, o_intz;
+
+  memory<dfloat> cubwJ;            //Jacobian at cubature points
+  deviceMemory<dfloat> o_cubwJ;
+  memory<dfloat> cubvgeo;          //volume geometric data at cubature points
+  deviceMemory<dfloat> o_cubvgeo;
+  memory<dfloat> cubsgeo;          //surface geometric data at cubature points
+  deviceMemory<dfloat> o_cubsgeo;
+  memory<dfloat> cubggeo;          //second type volume geometric data at cubature points
+  deviceMemory<dfloat> o_cubggeo;
+
+  /*************************/
+  /* MPI Data              */
+  /*************************/
   // MPI halo exchange info
-  halo_t *halo;            // halo exchange pointer
-  halo_t *ringHalo;        // ring halo exchange pointer
+  ogs::halo_t halo;            // halo exchange pointer
+  ogs::halo_t ringHalo;        // ring halo exchange pointer
   dlong NinternalElements=0; // number of elements that can update without halo exchange
   dlong NhaloElements=0;     // number of elements that cannot update without halo exchange
   dlong  totalHaloPairs=0;   // number of elements to be received in halo exchange
   dlong  totalRingElements=0;// number of elements to be received in ring halo exchange
-  dlong *internalElementIds;  // list of elements that can update without halo exchange
-  dlong *haloElementIds;      // list of elements to be sent in halo exchange
-  occa::memory o_internalElementIds;  // list of elements that can update without halo exchange
-  occa::memory o_haloElementIds;      // list of elements to be sent in halo exchange
+
+  memory<dlong> internalElementIds;  // list of elements that can update without halo exchange
+  memory<dlong> haloElementIds;      // list of elements to be sent in halo exchange
+  deviceMemory<dlong> o_internalElementIds;  // list of elements that can update without halo exchange
+  deviceMemory<dlong> o_haloElementIds;      // list of elements to be sent in halo exchange
 
   // CG gather-scatter info
-  ogs_t *ogs;              //occa gs pointer
-  hlong *globalIds;
+  ogs::ogs_t ogs;              //occa gs object
+  memory<hlong> globalIds;
 
   // list of elements that are needed for global gather-scatter
-  dlong NglobalGatherElements=0;
-  dlong *globalGatherElementList;
-  occa::memory o_globalGatherElementList;
+  dlong NglobalGatherElements=0;  // defaults to 0 so a default-constructed mesh_t never holds an indeterminate count
+  memory<dlong> globalGatherElementList;
+  deviceMemory<dlong> o_globalGatherElementList;
 
   // list of elements that are not needed for global gather-scatter
-  dlong NlocalGatherElements=0;
-  dlong *localGatherElementList;
-  occa::memory o_localGatherElementList;
-
-  // volumeGeometricFactors;
-  dlong Nvgeo=0;
-  dfloat *vgeo;
-
-  // second order volume geometric factors
-  dlong Nggeo=0;
-  dfloat *ggeo;
-
-  // volume node info
-  int N=0, Nq=0, Np=0;  // N = Polynomial order, Nq=N+1, and Np = Nodes per element
-  dfloat *r, *s, *t;    // coordinates of reference nodes
-  dfloat *w;            // quadrature weights (1d quadrature for tensor prod elements)
-  dfloat *MM, *invMM;   // reference mass matrix
-
-  dfloat *Dr, *Ds, *Dt; // collocation differentiation matrices
-  dfloat *D;            // packed collocation differentiation matrices,
-                        //  or 1D derivative for quads and hexes
-
-  dfloat *Srr,*Srs, *Srt; //element stiffness matrices
-  dfloat *Sss,*Sst, *Stt;
-  dfloat *S;              // packed element stiffness matrices
-
-  dfloat *x, *y, *z;    // coordinates of physical nodes
-
-  /* GeoData for affine mapped elements */
-  /* NC: disabling until we re-add treatment of affine elements
-  dfloat *EXYZ;  // element vertices for reconstructing geofacs
-  dfloat *gllzw; // GLL nodes and weights
-  dfloat *ggeoNoJW;
-  occa::memory o_EXYZ;
-  occa::memory o_gllzw;
-  occa::memory o_ggeoNoJW;
-  */
-
-  // face node info
-  int Nfp=0;         // number of nodes per face
-  int *faceNodes;    // list of element reference interpolation nodes on element faces
-  dlong *vmapM;      // list of volume nodes that are face nodes
-  dlong *vmapP;      // list of volume nodes that are paired with face nodes
-  dlong *mapP;       // list of surface nodes that are paired with -ve surface  nodes
-  int *faceVertices; // list of mesh vertices on each face
-
-  dfloat *LIFT; // lift matrix
-  dfloat *sM;   // surface mass MM*LIFT
-
-  dlong   Nsgeo=0;
-  dfloat *sgeo;
-
-  // cubature
-  int cubN=0, cubNp=0, cubNfp=0, cubNq=0;
-  dfloat *cubr, *cubs, *cubt, *cubw;    // coordinates and weights of reference cubature nodes
-  dfloat *cubx, *cuby, *cubz;           // coordinates of physical cubature nodes
-  dfloat *cubInterp;                    // interpolate from W&B to cubature nodes
-  dfloat *cubProject;                   // projection matrix from cubature nodes to W&B nodes
-  dfloat *cubD;                         // packed differentiation matrices
-  dfloat *cubPDT;                       // packed weak differentiation matrices
-  dfloat *cubPDrT, *cubPDsT, *cubPDtT;  // weak differentiation matrices
-
-  dfloat *cubvgeo;  //volume geometric data at cubature points
-  dfloat *cubsgeo;  //surface geometric data at cubature points
-  dfloat *cubggeo;  //second type volume geometric data at cubature points
-
-  // surface integration node info
-  int    intNfp=0;    // number of integration nodes on each face
-  dfloat *intr, *ints, *intw;
-  dfloat *intInterp; // interp from surface node to integration nodes
-  dfloat *intLIFT;   // lift from surface integration nodes to W&B volume nodes
-  dfloat *intx, *inty, *intz; // coordinates of suface integration nodes
+  dlong NlocalGatherElements=0;  // defaults to 0 so a default-constructed mesh_t never holds an indeterminate count
+  memory<dlong> localGatherElementList;
+  deviceMemory<dlong> o_localGatherElementList;
 
+  /*************************/
+  /* PML                   */
+  /*************************/
   //pml lists
   dlong NnonPmlElements=0;
   dlong NpmlElements=0;
 
-  dlong *pmlElements;
-  dlong *nonPmlElements;
-  dlong *pmlIds;
+  memory<dlong> pmlElements;
+  deviceMemory<dlong> o_pmlElements;
+  memory<dlong> nonPmlElements;
+  deviceMemory<dlong> o_nonPmlElements;
+  memory<dlong> pmlIds;
+  deviceMemory<dlong> o_pmlIds;
 
+
+  /*************************/
+  /* Multirate timestepping*/
+  /*************************/
   //multirate lists
   int mrNlevels=0;
-  int *mrLevel;
-  dlong *mrNelements, *mrInterfaceNelements;
-  dlong **mrElements, **mrInterfaceElements;
+  memory<int> mrLevel;
+  deviceMemory<int> o_mrLevel;
 
-  //multirate pml lists
-  dlong *mrNnonPmlElements, *mrNpmlElements;
-  dlong **mrPmlElements, **mrNonPmlElements;
-  dlong **mrPmlIds;
-
-  // plotting info for generating field vtu
-  int    plotNverts=0;    // number of vertices for each plot element
-  int    plotN=0;         // degree of plot interpolation
-  int    plotNq=0;        // plotNq = plotN+1
-  int    plotNp=0;        // number of plot nodes per element
-  int    plotNelements=0; // number of "plot elements" per element
-  int    *plotEToV;       // triangulation of plot nodes
-  dfloat *plotR, *plotS, *plotT; // coordinates of plot nodes in reference element
-  dfloat *plotInterp;    // reference to plot node interpolation matrix
+  memory<dlong> mrNelements, mrInterfaceNelements;
+  deviceMemory<dlong> o_mrNelements, o_mrInterfaceNelements;
 
+  memory<dlong> mrNnonPmlElements, mrNpmlElements;
+
+  memory<memory<dlong>> mrElements, mrInterfaceElements;
+  memory<deviceMemory<dlong>> o_mrElements, o_mrInterfaceElements;
+
+  //multirate pml lists
+  memory<memory<dlong>> mrPmlElements, mrNonPmlElements;
+  memory<deviceMemory<dlong>> o_mrPmlElements, o_mrNonPmlElements;
+  memory<memory<dlong>> mrPmlIds;
+  memory<deviceMemory<dlong>> o_mrPmlIds;
+
+  /*************************/
+  /* SEMFEM                */
+  /*************************/
   //SEMFEM data
   int NpFEM=0, NelFEM=0;
-  int *FEMEToV;
-  dfloat *rFEM, *sFEM, *tFEM;
-  dfloat *SEMFEMInterp;
+  memory<int> FEMEToV;
+  memory<dfloat> rFEM, sFEM, tFEM;
+  memory<dfloat> SEMFEMInterp;
+  deviceMemory<dfloat> o_SEMFEMInterp;
+  deviceMemory<dfloat> o_SEMFEMAnterp;
 
-  // occa stuff
-  occa::memory o_SEMFEMInterp;
-  occa::memory o_SEMFEMAnterp;
+  kernel_t MassMatrixKernel;
 
-  occa::memory o_MM;  // Mass matrix
-  occa::memory o_D;   // packed differentiation matricies (contains the transpose 1d D matrix for quads/hexes)
-  occa::memory o_S;   // packed stiffness matricies
-  occa::memory o_LIFT;// Surface lift matrix
-  occa::memory o_sM;  // Surface mass
-
-  // volume, surface, and second order geometric factors
-  occa::memory o_vgeo, o_sgeo, o_ggeo;
+  mesh_t() = default;  // default-constructible; no setup work is performed here
+  mesh_t(platform_t& _platform, meshSettings_t& _settings,
+         comm_t _comm) {
+    Setup(_platform, _settings, _comm);  // construction simply forwards its arguments to Setup()
+  }
 
-  //face node mappings
-  occa::memory o_vmapM, o_vmapP, o_mapP;
+  // mesh setup
+  void Setup(platform_t& _platform, meshSettings_t& _settings,
+             comm_t _comm);
 
-  //element boundary mappings
-  occa::memory o_EToB;
+  // setup ring halo
+  void HaloRingSetup();
 
-  //physical coordinates
-  occa::memory o_x, o_y, o_z;
+  // setup trace halo
+  ogs::halo_t HaloTraceSetup(int Nfields);
 
-  // cubature
-  occa::memory o_cubInterp, o_cubProject; //cubature interpolationm and projection
-  occa::memory o_cubPDT, o_cubD;          // weak cubature derivatives, and cubature derivatives
-  occa::memory o_intLIFT, o_intInterp;
+  //Setup PML elements
+  void PmlSetup();
+  void MultiRatePmlSetup();
 
-  //physical cubature coordinates
-  occa::memory o_cubx, o_cuby, o_cubz;
+  //Multirate partitioning
+  void MultiRateSetup(memory<dfloat> EToDT);
 
-  //physical surface cubature coordinates
-  occa::memory o_intx, o_inty, o_intz;
+  // Multirate trace halo
+  memory<ogs::halo_t> MultiRateHaloTraceSetup(int Nfields);
+
+  // Setup cubature: dispatch on elementType to the matching CubatureSetup* routine
+  void CubatureSetup() {
+    switch (elementType) {
+      case Mesh::TRIANGLES:
+        CubatureSetupTri2D();
+        break;
+      case Mesh::QUADRILATERALS:
+        CubatureSetupQuad2D();
+        break;
+      case Mesh::TETRAHEDRA:
+        CubatureSetupTet3D();
+        break;
+      case Mesh::HEXAHEDRA:
+        CubatureSetupHex3D();
+        break;
+    }  // no default case: any other elementType value does nothing
+  }
+
+  // Setup cubature physical nodes: dispatch on elementType (and on dim for triangles/quadrilaterals)
+  void CubaturePhysicalNodes() {
+    switch (elementType) {
+      case Mesh::TRIANGLES:
+        if (dim==2)  // dim==2 selects the 2D variant; any other dim selects the 3D variant
+          CubaturePhysicalNodesTri2D();
+        else
+          CubaturePhysicalNodesTri3D();
+        break;
+      case Mesh::QUADRILATERALS:
+        if (dim==2)  // same 2D/3D selection as for triangles
+          CubaturePhysicalNodesQuad2D();
+        else
+          CubaturePhysicalNodesQuad3D();
+        break;
+      case Mesh::TETRAHEDRA:
+        CubaturePhysicalNodesTet3D();
+        break;
+      case Mesh::HEXAHEDRA:
+        CubaturePhysicalNodesHex3D();
+        break;
+    }  // no default case: any other elementType value does nothing
+  }
 
-  // volume, surface, and second order geometric factors at cubature points
-  occa::memory o_cubvgeo, o_cubsgeo, o_cubggeo;
+  dfloat MinCharacteristicLength();
 
-  //pml lists
-  occa::memory o_pmlElements;
-  occa::memory o_nonPmlElements;
-  occa::memory o_pmlIds;
+  void PlotInterp(const memory<dfloat> q, memory<dfloat> Iq, memory<dfloat> scratch=memory<dfloat>()) {
+    switch (elementType) {  // forward q, Iq and scratch unchanged to the element-specific PlotInterp* routine
+      case Mesh::TRIANGLES:
+        PlotInterpTri2D(q, Iq, scratch);
+        break;
+      case Mesh::QUADRILATERALS:
+        PlotInterpQuad2D(q, Iq, scratch);
+        break;
+      case Mesh::TETRAHEDRA:
+        PlotInterpTet3D(q, Iq, scratch);
+        break;
+      case Mesh::HEXAHEDRA:
+        PlotInterpHex3D(q, Iq, scratch);
+        break;
+    }  // no default case: any other elementType value does nothing
+  }
+
+  void MassMatrixApply(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_Mq);
+  void MassMatrixKernelSetup(int Nfields) {
+    switch (elementType) {  // forward Nfields to the element-specific MassMatrixKernelSetup* routine
+      case Mesh::TRIANGLES:
+        MassMatrixKernelSetupTri2D(Nfields);
+        break;
+      case Mesh::QUADRILATERALS:
+        MassMatrixKernelSetupQuad2D(Nfields);
+        break;
+      case Mesh::TETRAHEDRA:
+        MassMatrixKernelSetupTet3D(Nfields);
+        break;
+      case Mesh::HEXAHEDRA:
+        MassMatrixKernelSetupHex3D(Nfields);
+        break;
+    }  // no default case: any other elementType value does nothing
+  }
+
+  dfloat ElementCharacteristicLength(dlong e) {
+    switch (elementType) {  // return the result of the element-specific routine for element e
+      case Mesh::TRIANGLES:
+        return ElementCharacteristicLengthTri2D(e);
+      case Mesh::QUADRILATERALS:
+        return ElementCharacteristicLengthQuad2D(e);
+      case Mesh::TETRAHEDRA:
+        return ElementCharacteristicLengthTet3D(e);
+      case Mesh::HEXAHEDRA:
+        return ElementCharacteristicLengthHex3D(e);
+      default:
+        return 0.0;  // any other elementType value yields a length of zero
+    }
+  }
 
-  //multirate lists
-  occa::memory o_mrLevel;
-  occa::memory o_mrNelements, o_mrInterfaceNelements;
-  occa::memory *o_mrElements, *o_mrInterfaceElements;
+  //create a new mesh object with the same geometry, but different degree
+  mesh_t SetupNewDegree(int Nf);
 
-  //multirate pml lists
-  occa::memory *o_mrPmlElements, *o_mrNonPmlElements;
-  occa::memory *o_mrPmlIds;
+  mesh_t SetupRingPatch();
 
-  occa::kernel MassMatrixKernel;
+  mesh_t SetupSEMFEM(memory<hlong>& globalIds, memory<int>& mapB);
 
-  mesh_t() = delete;
-  mesh_t(platform_t& _platform, meshSettings_t& _settings,
-         MPI_Comm _comm);
+  int RXID, RYID, RZID;
+  int SXID, SYID, SZID;
+  int TXID, TYID, TZID;
+  int JID, JWID, IJWID;
+  int G00ID, G01ID, G02ID, G11ID, G12ID, G22ID;
 
-  virtual ~mesh_t();
+  int NXID, NYID, NZID;
+  int SJID, IJID, IHID, WIJID, WSJID;
 
-  // generic mesh setup
-  static mesh_t& Setup(platform_t& _platform, meshSettings_t& _settings,
-                       MPI_Comm _comm);
+ private:
+  /*Set the type of mesh*/
+  void SetElementType(const Mesh::ElementType eType);
 
   // box mesh
-  virtual void SetupBox() = 0;
+  void SetupBox() {
+    switch (elementType) {  // dispatch to the element-specific SetupBox* routine
+      case Mesh::TRIANGLES:
+        SetupBoxTri2D();
+        break;
+      case Mesh::QUADRILATERALS:
+        SetupBoxQuad2D();
+        break;
+      case Mesh::TETRAHEDRA:
+        SetupBoxTet3D();
+        break;
+      case Mesh::HEXAHEDRA:
+        SetupBoxHex3D();
+        break;
+    }  // no default case: any other elementType value does nothing
+  }
+  void SetupBoxTri2D();
+  void SetupBoxQuad2D();
+  void SetupBoxTet3D();
+  void SetupBoxHex3D();
 
   // pml box mesh
-  virtual void SetupPmlBox() = 0;
+  void SetupPmlBox() {
+    switch (elementType) {  // dispatch to the element-specific SetupPmlBox* routine
+      case Mesh::TRIANGLES:
+        SetupPmlBoxTri2D();
+        break;
+      case Mesh::QUADRILATERALS:
+        SetupPmlBoxQuad2D();
+        break;
+      case Mesh::TETRAHEDRA:
+        SetupPmlBoxTet3D();
+        break;
+      case Mesh::HEXAHEDRA:
+        SetupPmlBoxHex3D();
+        break;
+    }  // no default case: any other elementType value does nothing
+  }
+  void SetupPmlBoxTri2D();
+  void SetupPmlBoxQuad2D();
+  void SetupPmlBoxTet3D();
+  void SetupPmlBoxHex3D();
 
   // mesh reader
-  virtual void ParallelReader(const char *fileName) = 0;
-
-  // repartition elements in parallel
-  virtual void GeometricPartition() = 0;
+  void ReadGmsh(const std::string fileName) {
+    switch (elementType) {  // forward fileName to the element-specific ReadGmsh* routine
+      case Mesh::TRIANGLES:
+        if(dim==2)  // dim==2 selects the 2D reader; any other dim selects the 3D reader
+          ReadGmshTri2D(fileName);
+        else
+          ReadGmshTri3D(fileName);
+        break;
+      case Mesh::QUADRILATERALS:
+        if(dim==2)  // same 2D/3D selection as for triangles
+          ReadGmshQuad2D(fileName);
+        else
+          ReadGmshQuad3D(fileName);
+        break;
+      case Mesh::TETRAHEDRA:
+        ReadGmshTet3D(fileName);
+        break;
+      case Mesh::HEXAHEDRA:
+        ReadGmshHex3D(fileName);
+        break;
+    }  // no default case: any other elementType value reads nothing
+  }
+  void ReadGmshTri2D(const std::string fileName);
+  void ReadGmshTri3D(const std::string fileName);
+  void ReadGmshQuad2D(const std::string fileName);
+  void ReadGmshQuad3D(const std::string fileName);
+  void ReadGmshTet3D(const std::string fileName);
+  void ReadGmshHex3D(const std::string fileName);
+
+  // reference nodes and operators: dispatch on elementType to the matching ReferenceNodes* routine
+  void ReferenceNodes() {
+    switch (elementType) {
+      case Mesh::TRIANGLES:
+        ReferenceNodesTri2D();
+        break;
+      case Mesh::QUADRILATERALS:
+        ReferenceNodesQuad2D();
+        break;
+      case Mesh::TETRAHEDRA:
+        ReferenceNodesTet3D();
+        break;
+      case Mesh::HEXAHEDRA:
+        ReferenceNodesHex3D();
+        break;
+    }  // no default case: any other elementType value does nothing
+  }
+  void ReferenceNodesTri2D();
+  void ReferenceNodesQuad2D();
+  void ReferenceNodesTet3D();
+  void ReferenceNodesHex3D();
+
+  // repartition elements
+  void Partition();
 
   /* build parallel face connectivity */
-  void ParallelConnect();
   void Connect();
 
   // build element-boundary connectivity
   void ConnectBoundary();
 
-  virtual void ReferenceNodes(int N) = 0;
+  // face-vertex to face-vertex connection
+  void ConnectFaceVertices();
 
-  /* compute x,y,z coordinates of each node */
-  virtual void PhysicalNodes() = 0;
-
-  // compute geometric factors for local to physical map
-  virtual void GeometricFactors() = 0;
-
-  virtual void SurfaceGeometricFactors() = 0;
-
-  // serial face-node to face-node connection
-  virtual void ConnectFaceNodes() = 0;
+  // face-node to face-node connection
+  void ConnectFaceNodes();
 
   // setup halo region
   void HaloSetup();
 
-  // setup trace halo
-  void HaloRingSetup();
-
-  // setup trace halo
-  halo_t* HaloTraceSetup(int Nfields);
-
   /* build global connectivity in parallel */
-  void ParallelConnectNodes();
+  void ConnectNodes();
 
   /* build global gather scatter ops */
-  void ParallelGatherScatterSetup();
+  void GatherScatterSetup();
 
-  //Setup PML elements
-  void PmlSetup();
-  void MultiRatePmlSetup();
-
-  //Multirate partitioning
-  void MultiRateSetup(dfloat *EToDT);
-
-  // Multirate trace halo
-  halo_t** MultiRateHaloTraceSetup(int Nfields);
-
-  virtual void OccaSetup();
-
-  virtual void CubatureSetup()=0;
-
-  virtual void CubatureNodes()=0;
-
-  // print out parallel partition i
-  void PrintPartitionStatistics();
-
-  virtual dfloat ElementCharacteristicLength(dlong e) = 0;
-
-  dfloat MinCharacteristicLength();
-
-  virtual void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr)=0;
-
-  void RecursiveSpectralBisectionPartition();
-
-  void MassMatrixApply(occa::memory& o_q, occa::memory& o_Mq);
-  virtual void MassMatrixKernelSetup(int Nfields)=0;
-
-  //create a new mesh object with the same geometry, but different degree
-  mesh_t& SetupNewDegree(int Nf);
-
-  mesh_t* SetupRingPatch();
-
-  mesh_t* SetupSEMFEM(hlong **globalIds, int *Nfp, int **faceNodes);
+  /* compute x,y,z coordinates of each node; dispatches on elementType (and on dim for triangles/quadrilaterals) */
+  void PhysicalNodes() {
+    switch (elementType) {
+      case Mesh::TRIANGLES:
+        if(dim==2)  // dim==2 selects the 2D variant; any other dim selects the 3D variant
+          PhysicalNodesTri2D();
+        else
+          PhysicalNodesTri3D();
+        break;
+      case Mesh::QUADRILATERALS:
+        if(dim==2)  // same 2D/3D selection as for triangles
+          PhysicalNodesQuad2D();
+        else
+          PhysicalNodesQuad3D();
+        break;
+      case Mesh::TETRAHEDRA:
+        PhysicalNodesTet3D();
+        break;
+      case Mesh::HEXAHEDRA:
+        PhysicalNodesHex3D();
+        break;
+    }  // no default case: any other elementType value does nothing
+  }
+  void PhysicalNodesTri2D();
+  void PhysicalNodesTri3D();
+  void PhysicalNodesQuad2D();
+  void PhysicalNodesQuad3D();
+  void PhysicalNodesTet3D();
+  void PhysicalNodesHex3D();
 
-  void DegreeRaiseMatrix1D(int Nc, int Nf, dfloat *P);
-  void DegreeRaiseMatrixTri2D(int Nc, int Nf, dfloat *P);
-  void DegreeRaiseMatrixTet3D(int Nc, int Nf, dfloat *P);
+  // compute geometric factors for local to physical map; dispatches on elementType (and dim for tris/quads)
+  void GeometricFactors() {
+    switch (elementType) {
+      case Mesh::TRIANGLES:
+        if(dim==2)  // dim==2 selects the 2D variant; any other dim selects the 3D variant
+          GeometricFactorsTri2D();
+        else
+          GeometricFactorsTri3D();
+        break;
+      case Mesh::QUADRILATERALS:
+        if(dim==2)  // same 2D/3D selection as for triangles
+          GeometricFactorsQuad2D();
+        else
+          GeometricFactorsQuad3D();
+        break;
+      case Mesh::TETRAHEDRA:
+        GeometricFactorsTet3D();
+        break;
+      case Mesh::HEXAHEDRA:
+        GeometricFactorsHex3D();
+        break;
+    }  // no default case: any other elementType value does nothing
+  }
+  void GeometricFactorsTri2D();
+  void GeometricFactorsTri3D();
+  void GeometricFactorsQuad2D();
+  void GeometricFactorsQuad3D();
+  void GeometricFactorsTet3D();
+  void GeometricFactorsHex3D();
+
+  void SurfaceGeometricFactors() {
+    switch (elementType) {  // dispatch to the element-specific SurfaceGeometricFactors* routine
+      case Mesh::TRIANGLES:
+        if(dim==2)  // dim==2 selects the 2D variant; any other dim selects the 3D variant
+          SurfaceGeometricFactorsTri2D();
+        else
+          SurfaceGeometricFactorsTri3D();
+        break;
+      case Mesh::QUADRILATERALS:
+        if(dim==2)  // same 2D/3D selection as for triangles
+          SurfaceGeometricFactorsQuad2D();
+        else
+          SurfaceGeometricFactorsQuad3D();
+        break;
+      case Mesh::TETRAHEDRA:
+        SurfaceGeometricFactorsTet3D();
+        break;
+      case Mesh::HEXAHEDRA:
+        SurfaceGeometricFactorsHex3D();
+        break;
+    }  // no default case: any other elementType value does nothing
+  }
+  void SurfaceGeometricFactorsTri2D();
+  void SurfaceGeometricFactorsTri3D();
+  void SurfaceGeometricFactorsQuad2D();
+  void SurfaceGeometricFactorsQuad3D();
+  void SurfaceGeometricFactorsTet3D();
+  void SurfaceGeometricFactorsHex3D();
+
+  void CubatureSetupTri2D();
+  void CubatureSetupQuad2D();
+  void CubatureSetupTet3D();
+  void CubatureSetupHex3D();
+
+  void CubaturePhysicalNodesTri2D();
+  void CubaturePhysicalNodesTri3D();
+  void CubaturePhysicalNodesQuad2D();
+  void CubaturePhysicalNodesQuad3D();
+  void CubaturePhysicalNodesTet3D();
+  void CubaturePhysicalNodesHex3D();
+
+  void PlotInterpTri2D(const memory<dfloat> q, memory<dfloat> Iq, memory<dfloat> scratch);
+  void PlotInterpQuad2D(const memory<dfloat> q, memory<dfloat> Iq, memory<dfloat> scratch);
+  void PlotInterpTet3D(const memory<dfloat> q, memory<dfloat> Iq, memory<dfloat> scratch);
+  void PlotInterpHex3D(const memory<dfloat> q, memory<dfloat> Iq, memory<dfloat> scratch);
+
+  void MassMatrixKernelSetupTri2D(int Nfields);
+  void MassMatrixKernelSetupQuad2D(int Nfields);
+  void MassMatrixKernelSetupTet3D(int Nfields);
+  void MassMatrixKernelSetupHex3D(int Nfields);
+
+  dfloat ElementCharacteristicLengthTri2D(dlong e);
+  dfloat ElementCharacteristicLengthQuad2D(dlong e);
+  dfloat ElementCharacteristicLengthTet3D(dlong e);
+  dfloat ElementCharacteristicLengthHex3D(dlong e);
 
   /***************************************************************************/
   // Basic codes for generating nodes, polynomials, matrices, etc.
 
-public:
+ public:
   //1D
-  static void Nodes1D(int N, dfloat *r);
-  static void EquispacedNodes1D(int _N, dfloat *_r);
-  static void OrthonormalBasis1D(dfloat a, int i, dfloat *P);
-  static void GradOrthonormalBasis1D(dfloat a, int i, dfloat *Pr);
-  static void Vandermonde1D(int N, int Npoints, dfloat *r, dfloat *V);
-  static void GradVandermonde1D(int N, int Npoints, dfloat *r, dfloat *Vr);
-
-  static void MassMatrix1D(int _Np, dfloat *V, dfloat *MM);
-  static void Dmatrix1D(int _N, int NpointsIn, dfloat *_rIn, int NpointsOut, dfloat *_rOut, dfloat *_Dr);
-  static void InterpolationMatrix1D(int _N,int NpointsIn, dfloat *rIn, int NpointsOut, dfloat *rOut, dfloat *I);
-  static void CubatureWeakDmatrix1D(int _Nq, int _cubNq, dfloat *_cubProject, dfloat *_cubD, dfloat *_cubPDT);
+  static void Nodes1D(const int _N, memory<dfloat>& _r);
+  static void EquispacedNodes1D(const int _N, memory<dfloat>& _r);
+  static void OrthonormalBasis1D(const dfloat a, const int i, dfloat& P);
+  static void GradOrthonormalBasis1D(const dfloat a, const int i, dfloat& Pr);
+  static void Vandermonde1D(const int _N,
+                            const memory<dfloat> _r,
+                            memory<dfloat>& V);
+  static void GradVandermonde1D(const int _N,
+                                const memory<dfloat> _r,
+                                memory<dfloat>& Vr);
+
+  static void MassMatrix1D(const int _Np,
+                           const memory<dfloat> V,
+                           memory<dfloat>& _MM);
+  static void Dmatrix1D(const int _N,
+                        const memory<dfloat> _rIn,
+                        const memory<dfloat> _rOut,
+                        memory<dfloat>& _Dr);
+  static void InterpolationMatrix1D(const int _N,
+                                    const memory<dfloat> _rIn,
+                                    const memory<dfloat> _rOut,
+                                    memory<dfloat>& I);
+  static void DegreeRaiseMatrix1D(const int Nc, const int Nf,
+                                  memory<dfloat>& P);
+  static void CubatureWeakDmatrix1D(const int _Nq, const int _cubNq,
+                                    const memory<dfloat> _cubProject,
+                                    const memory<dfloat> _cubD,
+                                    memory<dfloat>& _cubPDT);
 
   //Jacobi polynomial evaluation
-  static dfloat JacobiP(dfloat a, dfloat alpha, dfloat beta, int N);
-  static dfloat GradJacobiP(dfloat a, dfloat alpha, dfloat beta, int N);
+  static dfloat JacobiP(const dfloat a, const dfloat alpha,
+                        const dfloat beta, const int _N);
+  static dfloat GradJacobiP(const dfloat a, const dfloat alpha,
+                            const dfloat beta, const int _N);
 
   //Gauss-Legendre-Lobatto quadrature nodes
-  static void JacobiGLL(int N, dfloat *x, dfloat *w=NULL);
+  static void JacobiGLL(const int _N,
+                        memory<dfloat>& _x);
+  static void JacobiGLL(const int _N,
+                        memory<dfloat>& _x,
+                        memory<dfloat>& _w);
 
   //Nth order Gauss-Jacobi quadrature nodes and weights
-  static void JacobiGQ(dfloat alpha, dfloat beta, int N, dfloat *x, dfloat *w);
+  static void JacobiGQ(const dfloat alpha, const dfloat beta,
+                       const int _N,
+                       memory<dfloat>& _x,
+                       memory<dfloat>& _w);
 
   //Tris
-  static void NodesTri2D(int _N, dfloat *_r, dfloat *_s);
-  static void FaceNodesTri2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes);
-  static void VertexNodesTri2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes);
-  static void EquispacedNodesTri2D(int _N, dfloat *_r, dfloat *_s);
-  static void EquispacedEToVTri2D(int _N, int *_EToV);
-  static void SEMFEMNodesTri2D(int _N, int *_Np, dfloat **_r, dfloat **_s);
-  static void SEMFEMEToVTri2D(int _N, int *_NelFEM, int **_EToV);
-  static void OrthonormalBasisTri2D(dfloat a, dfloat b, int i, int j, dfloat *P);
-  static void GradOrthonormalBasisTri2D(dfloat a, dfloat b, int i, int j, dfloat *Pr, dfloat *Ps);
-  static void VandermondeTri2D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *V);
-  static void GradVandermondeTri2D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *Vr, dfloat *Vs);
-  static void MassMatrixTri2D(int _Np, dfloat *V, dfloat *_MM);
-  static void invMassMatrixTri2D(int _Np, dfloat *V, dfloat *_invMM);
-  static void DmatrixTri2D(int _N, int Npoints, dfloat *_r, dfloat *_s,
-                           dfloat *_Dr, dfloat *_Ds);
-  static void LIFTmatrixTri2D(int _N, int *_faceNodes,
-                              dfloat *_r, dfloat *_s, dfloat *_LIFT);
-  static void SurfaceMassMatrixTri2D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat *_sM);
-  static void SmatrixTri2D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_MM,
-                           dfloat *_Srr, dfloat *_Srs, dfloat *_Sss);
-  static void InterpolationMatrixTri2D(int _N,
-                                       int NpointsIn, dfloat *rIn, dfloat *sIn,
-                                       int NpointsOut, dfloat *rOut, dfloat *sOut,
-                                       dfloat *I);
-  static void CubatureNodesTri2D(int cubTriN, int*cubNp, dfloat **cubTrir, dfloat **cubTris, dfloat **cubTriw);
-  static void CubaturePmatrixTri2D(int _N, int _Np, dfloat *_r, dfloat *_s,
-                                   int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubProject);
-  static void CubatureWeakDmatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_s,
-                                         int _cubNp, dfloat *_cubr, dfloat *_cubs,
-                                         dfloat *_cubPDrT, dfloat *_cubPDsT);
-  static void CubatureSurfaceMatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_s, int *_faceNodes,
-                                           int _intNfp, dfloat *_intr, dfloat *_intw,
-                                           dfloat *_intInterp, dfloat *_intLIFT);
-  static void SEMFEMInterpMatrixTri2D(int _N,
-                                      int _Np, dfloat *_r, dfloat *_s,
-                                      int _NpFEM, dfloat *rFEM, dfloat *sFEM,
-                                      dfloat *I);
-
-  static void Warpfactor(int _N, int Npoints, dfloat *r, dfloat *w);
-  static void WarpBlendTransformTri2D(int _N, int _Npoints, dfloat *_r, dfloat *_s, dfloat alphaIn=-1);
+  static void NodesTri2D(const int _N,
+                         memory<dfloat>& _r,
+                         memory<dfloat>& _s);
+  static void FaceNodesTri2D(const int _N,
+                             const memory<dfloat> _r,
+                             const memory<dfloat> _s,
+                             memory<int>& _faceNodes);
+  static void VertexNodesTri2D(const int _N,
+                               const memory<dfloat> _r,
+                               const memory<dfloat> _s,
+                               memory<int>& _vertexNodes);
+  static void FaceNodeMatchingTri2D(const memory<dfloat> _r,
+                                    const memory<dfloat> _s,
+                                    const memory<int> _faceNodes,
+                                    const memory<int> _faceVertices,
+                                    memory<int>& R);
+  static void EquispacedNodesTri2D(const int _N,
+                                   memory<dfloat>& _r,
+                                   memory<dfloat>& _s);
+  static void EquispacedEToVTri2D(const int _N, memory<int>& _EToV);
+  static void SEMFEMNodesTri2D(const int _N,
+                               int& _Np,
+                               memory<dfloat>& _r,
+                               memory<dfloat>& _s);
+  static void SEMFEMEToVTri2D(const int _N,
+                              int& _NelFEM,
+                              memory<int>& _EToV);
+  static void OrthonormalBasisTri2D(const dfloat _r, const dfloat _s,
+                                    const int i, const int j,
+                                    dfloat& P);
+  static void GradOrthonormalBasisTri2D(const dfloat _r, const dfloat _s,
+                                        const int i, const int j,
+                                        dfloat& Pr, dfloat& Ps);
+  static void VandermondeTri2D(const int _N,
+                               const memory<dfloat> _r,
+                               const memory<dfloat> _s,
+                               memory<dfloat>& V);
+  static void GradVandermondeTri2D(const int _N,
+                                   const memory<dfloat> _r,
+                                   const memory<dfloat> _s,
+                                   memory<dfloat>& Vr,
+                                   memory<dfloat>& Vs);
+  static void MassMatrixTri2D(const int _Np,
+                              const memory<dfloat> V,
+                              memory<dfloat>& _MM);
+  static void invMassMatrixTri2D(const int _Np,
+                                 const memory<dfloat> V,
+                                 memory<dfloat>& _invMM);
+  static void DmatrixTri2D(const int _N,
+                           const memory<dfloat> _r,
+                           const memory<dfloat> _s,
+                           memory<dfloat>& _D);
+  static void LIFTmatrixTri2D(const int _N,
+                              const memory<int> _faceNodes,
+                              const memory<dfloat> _r,
+                              const memory<dfloat> _s,
+                              memory<dfloat>& _LIFT);
+  static void SurfaceMassMatrixTri2D(const int _N,
+                                     const memory<dfloat> _MM,
+                                     const memory<dfloat> _LIFT,
+                                     memory<dfloat>& _sM);
+  static void SmatrixTri2D(const int _N,
+                           const memory<dfloat> _Dr,
+                           const memory<dfloat> _Ds,
+                           const memory<dfloat> _MM,
+                           memory<dfloat>& _S);
+  static void InterpolationMatrixTri2D(const int _N,
+                                       const memory<dfloat> rIn,
+                                       const memory<dfloat> sIn,
+                                       const memory<dfloat> rOut,
+                                       const memory<dfloat> sOut,
+                                       memory<dfloat>& I);
+  static void DegreeRaiseMatrixTri2D(const int Nc, const int Nf,
+                                     memory<dfloat>& P);
+  static void CubatureNodesTri2D(const int cubTriN,
+                                 int& _cubNp,
+                                 memory<dfloat>& cubTrir,
+                                 memory<dfloat>& cubTris,
+                                 memory<dfloat>& cubTriw);
+  static void CubaturePmatrixTri2D(const int _N,
+                                   const memory<dfloat> _r,
+                                   const memory<dfloat> _s,
+                                   const memory<dfloat> _cubr,
+                                   const memory<dfloat> _cubs,
+                                   memory<dfloat>& _cubProject);
+  static void CubatureWeakDmatricesTri2D(const int _N,
+                                         const memory<dfloat> _r,
+                                         const memory<dfloat> _s,
+                                         const memory<dfloat> _cubr,
+                                         const memory<dfloat> _cubs,
+                                         memory<dfloat>& _cubPDT);
+  static void CubatureSurfaceMatricesTri2D(const int _N,
+                                           const memory<dfloat> _r,
+                                           const memory<dfloat> _s,
+                                           const memory<int> _faceNodes,
+                                           const memory<dfloat> _intr,
+                                           const memory<dfloat> _intw,
+                                           memory<dfloat>& _intInterp,
+                                           memory<dfloat>& _intLIFT);
+  static void SEMFEMInterpMatrixTri2D(const int _N,
+                                      const memory<dfloat> _r,
+                                      const memory<dfloat> _s,
+                                      const memory<dfloat> _rFEM,
+                                      const memory<dfloat> _sFEM,
+                                      memory<dfloat>& I);
+
+  static void Warpfactor(const int _N,
+                         const memory<dfloat> _r,
+                         memory<dfloat> warp); // non-const and passed by value, unlike the memory<dfloat>& outputs of sibling routines; NOTE(review): presumably filled in place -- confirm in the definition
+  static void WarpBlendTransformTri2D(const int _N,
+                                      memory<dfloat> _r, // _r/_s are non-const and passed by value; NOTE(review): presumably transformed in place -- confirm in the definition
+                                      memory<dfloat> _s,
+                                      const dfloat alphaIn=-1); // defaults to -1; NOTE(review): presumably a sentinel meaning "use a built-in alpha" -- confirm
 
 
   //Quads
-  static void NodesQuad2D(int _N, dfloat *_r, dfloat *_s);
-  static void FaceNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes);
-  static void VertexNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes);
-  static void EquispacedNodesQuad2D(int _N, dfloat *_r, dfloat *_s);
-  static void EquispacedEToVQuad2D(int _N, int *_EToV);
-  static void SEMFEMEToVQuad2D(int _N, int *_EToV);
-  static void OrthonormalBasisQuad2D(dfloat a, dfloat b, int i, int j, dfloat *P);
-  static void GradOrthonormalBasisQuad2D(dfloat a, dfloat b, int i, int j, dfloat *Pr, dfloat *Ps);
-  static void VandermondeQuad2D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *V);
-  static void GradVandermondeQuad2D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *Vr, dfloat *Vs);
-  static void MassMatrixQuad2D(int _Np, dfloat *V, dfloat *_MM);
-  static void LumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_MM);
-  static void invLumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_invMM);
-  static void DmatrixQuad2D(int _N, int Npoints, dfloat *_r, dfloat *_s,
-                                          dfloat *_Dr, dfloat *_Ds);
-  static void InterpolationMatrixQuad2D(int _N,
-                                        int NpointsIn, dfloat *rIn, dfloat *sIn,
-                                        int NpointsOut, dfloat *rOut, dfloat *sOut,
-                                        dfloat *I);
+  static void NodesQuad2D(const int _N,
+                          memory<dfloat>& _r,
+                          memory<dfloat>& _s);
+  static void FaceNodesQuad2D(const int _N,
+                              const memory<dfloat> _r,
+                              const memory<dfloat> _s,
+                              memory<int>& _faceNodes);
+  static void VertexNodesQuad2D(const int _N,
+                                const memory<dfloat> _r,
+                                const memory<dfloat> _s,
+                                memory<int>& _vertexNodes);
+  static void FaceNodeMatchingQuad2D(const memory<dfloat> _r,
+                                     const memory<dfloat> _s,
+                                     const memory<int> _faceNodes,
+                                     const memory<int> _faceVertices,
+                                     memory<int>& R);
+  static void EquispacedNodesQuad2D(const int _N,
+                                    memory<dfloat>& _r,
+                                    memory<dfloat>& _s);
+  static void EquispacedEToVQuad2D(const int _N, memory<int>& _EToV);
+  static void SEMFEMEToVQuad2D(const int _N, memory<int>& _EToV);
+  static void OrthonormalBasisQuad2D(const dfloat a, const dfloat b,
+                                     const int i, const int j,
+                                     dfloat& P);
+  static void GradOrthonormalBasisQuad2D(const dfloat a, const dfloat b,
+                                         const int i, const int j,
+                                         dfloat& Pr, dfloat& Ps);
+  static void VandermondeQuad2D(const int _N,
+                                const memory<dfloat> _r,
+                                const memory<dfloat> _s,
+                                memory<dfloat>& V);
+  static void GradVandermondeQuad2D(const int _N,
+                                    const memory<dfloat> _r,
+                                    const memory<dfloat> _s,
+                                    memory<dfloat>& Vr,
+                                    memory<dfloat>& Vs);
+  static void MassMatrixQuad2D(const int _Np,
+                               const memory<dfloat> V,
+                               memory<dfloat>& _MM);
+  static void LumpedMassMatrixQuad2D(const int _N,
+                                     const memory<dfloat> _gllw,
+                                     memory<dfloat>& _MM);
+  static void invLumpedMassMatrixQuad2D(const int _N,
+                                        const memory<dfloat> _gllw,
+                                        memory<dfloat>& _invMM);
+  static void DmatrixQuad2D(const int _N,
+                            const memory<dfloat> _r,
+                            const memory<dfloat> _s,
+                            memory<dfloat>& _D); // one output where the pointer-based API had separate _Dr and _Ds; the packing order is not visible in this declaration
+  static void InterpolationMatrixQuad2D(const int _N,
+                                        const memory<dfloat> rIn,
+                                        const memory<dfloat> sIn,
+                                        const memory<dfloat> rOut,
+                                        const memory<dfloat> sOut,
+                                        memory<dfloat>& I);
 
   //Tets
-  static void NodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t);
-  static void FaceNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes);
-  static void VertexNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_vertexNodes);
-  static void EquispacedNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t);
-  static void EquispacedEToVTet3D(int _N, int *_EToV);
-  static void SEMFEMEToVTet3D(int _N, int *_EToV);
-  static void OrthonormalBasisTet3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *P);
-  static void GradOrthonormalBasisTet3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *Pr, dfloat *Ps, dfloat *Pt);
-  static void VandermondeTet3D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *t, dfloat *V);
-  static void GradVandermondeTet3D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *t, dfloat *Vr, dfloat *Vs, dfloat *Vt);
-  static void MassMatrixTet3D(int _Np, dfloat *V, dfloat *_MM);
-  static void invMassMatrixTet3D(int _Np, dfloat *V, dfloat *_invMM);
-  static void DmatrixTet3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t,
-                           dfloat *_Dr, dfloat *_Ds, dfloat *_Dt);
-  static void LIFTmatrixTet3D(int _N, int *_faceNodes,
-                              dfloat *_r, dfloat *_s, dfloat *_t, dfloat *_LIFT);
-  static void SurfaceMassMatrixTet3D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat *_sM);
-  static void SmatrixTet3D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_Dt, dfloat *_MM,
-                           dfloat *_Srr, dfloat *_Srs, dfloat *_Srt,
-                           dfloat *_Sss, dfloat *_Sst, dfloat *_Stt);
-  static void InterpolationMatrixTet3D(int _N,
-                                       int NpointsIn, dfloat *rIn, dfloat *sIn, dfloat *tIn,
-                                       int NpointsOut, dfloat *rOut, dfloat *sOut, dfloat *tOut,
-                                       dfloat *I);
-  static void CubatureNodesTet3D(int cubN, int*cubNp, dfloat **cubr, dfloat **cubs, dfloat **cubt, dfloat **cubw);
-  static void CubaturePmatrixTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t,
-                                   int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubt,
-                                   dfloat *_cubProject);
-  static void CubatureWeakDmatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t,
-                                         int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubt,
-                                         dfloat *_cubPDrT, dfloat *_cubPDsT, dfloat *_cubPDtT);
-  static void CubatureSurfaceMatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes,
-                                           int _intNfp, dfloat *_intr, dfloat *_ints, dfloat *_intw,
-                                           dfloat *_intInterp, dfloat *_intLIFT);
-  static void SEMFEMInterpMatrixTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t,
-                                      int _NpFEM, dfloat *_rFEM, dfloat *_sFEM, dfloat *_tFEM,
-                                      dfloat *I);
-  static void WarpShiftFace3D(int _N, int Npoints, dfloat alpha,
-                              dfloat *L1, dfloat *L2, dfloat *L3,
-                              dfloat *w1, dfloat *w2);
-  static void WarpBlendTransformTet3D(int _N, int _Npoints, dfloat *_r, dfloat *_s, dfloat *_t, dfloat alphaIn=-1);
+  static void NodesTet3D(const int _N,
+                         memory<dfloat>& _r,
+                         memory<dfloat>& _s,
+                         memory<dfloat>& _t);
+  static void FaceNodesTet3D(const int _N,
+                             const memory<dfloat> _r,
+                             const memory<dfloat> _s,
+                             const memory<dfloat> _t,
+                             memory<int>& _faceNodes);
+  static void VertexNodesTet3D(const int _N,
+                               const memory<dfloat> _r,
+                               const memory<dfloat> _s,
+                               const memory<dfloat> _t,
+                               memory<int>& _vertexNodes);
+  static void FaceNodeMatchingTet3D(const memory<dfloat> _r,
+                                    const memory<dfloat> _s,
+                                    const memory<dfloat> _t,
+                                    const memory<int> _faceNodes,
+                                    const memory<int> _faceVertices,
+                                    memory<int>& R);
+  static void EquispacedNodesTet3D(const int _N,
+                                   memory<dfloat>& _r,
+                                   memory<dfloat>& _s,
+                                   memory<dfloat>& _t);
+  static void EquispacedEToVTet3D(const int _N, memory<int>& _EToV);
+  static void SEMFEMEToVTet3D(const int _N, memory<int>& _EToV);
+  static void OrthonormalBasisTet3D(const dfloat _r, const dfloat _s, const dfloat _t, // NOTE(review): named _r,_s,_t here but a,b,c in the Quad/Hex counterparts -- confirm which coordinates callers must pass
+                                    const int i, const int j, const int k,
+                                    dfloat& P); // P is the only non-const parameter: a reference to a single dfloat
+  static void GradOrthonormalBasisTet3D(const dfloat _r, const dfloat _s, const dfloat _t, // NOTE(review): named _r,_s,_t here but a,b,c in the Quad/Hex counterparts -- confirm which coordinates callers must pass
+                                        const int i, const int j, const int k,
+                                        dfloat& Pr, dfloat& Ps, dfloat& Pt); // the three non-const parameters are references to single dfloat values
+  static void VandermondeTet3D(const int _N,
+                               const memory<dfloat> _r,
+                               const memory<dfloat> _s,
+                               const memory<dfloat> _t,
+                               memory<dfloat>& V);
+  static void GradVandermondeTet3D(const int _N,
+                                   const memory<dfloat> _r,
+                                   const memory<dfloat> _s,
+                                   const memory<dfloat> _t,
+                                   memory<dfloat>& Vr,
+                                   memory<dfloat>& Vs,
+                                   memory<dfloat>& Vt);
+  static void MassMatrixTet3D(const int _Np,
+                              const memory<dfloat> V,
+                              memory<dfloat>& _MM);
+  static void invMassMatrixTet3D(const int _Np,
+                                 const memory<dfloat> V,
+                                 memory<dfloat>& _invMM);
+  static void DmatrixTet3D(const int _N,
+                           const memory<dfloat> _r,
+                           const memory<dfloat> _s,
+                           const memory<dfloat> _t,
+                           memory<dfloat>& _D); // one output where the pointer-based API had separate _Dr, _Ds and _Dt; the packing order is not visible in this declaration
+  static void LIFTmatrixTet3D(const int _N,
+                              const memory<int> _faceNodes,
+                              const memory<dfloat> _r,
+                              const memory<dfloat> _s,
+                              const memory<dfloat> _t,
+                              memory<dfloat>& _LIFT);
+  static void SurfaceMassMatrixTet3D(const int _N,
+                                     const memory<dfloat> _MM,
+                                     const memory<dfloat> _LIFT,
+                                     memory<dfloat>& _sM);
+  static void SmatrixTet3D(const int _N,
+                           const memory<dfloat> _Dr,
+                           const memory<dfloat> _Ds,
+                           const memory<dfloat> _Dt,
+                           const memory<dfloat> _MM,
+                           memory<dfloat>& _S); // one output where the pointer-based API had six (_Srr,_Srs,_Srt,_Sss,_Sst,_Stt); the packing order is not visible in this declaration
+  static void InterpolationMatrixTet3D(const int _N,
+                                       const memory<dfloat> rIn,
+                                       const memory<dfloat> sIn,
+                                       const memory<dfloat> tIn,
+                                       const memory<dfloat> rOut,
+                                       const memory<dfloat> sOut,
+                                       const memory<dfloat> tOut,
+                                       memory<dfloat>& I);
+  static void DegreeRaiseMatrixTet3D(const int Nc, const int Nf,
+                                     memory<dfloat>& P);
+  static void CubatureNodesTet3D(const int cubTetN,
+                                 int& _cubNp, // non-const reference; the pointer-based API took int* here
+                                 memory<dfloat>& _cubr, // the four arrays were dfloat** parameters in the pointer-based API
+                                 memory<dfloat>& _cubs,
+                                 memory<dfloat>& _cubt,
+                                 memory<dfloat>& _cubw);
+  static void CubaturePmatrixTet3D(const int _N,
+                                   const memory<dfloat> _r,
+                                   const memory<dfloat> _s,
+                                   const memory<dfloat> _t,
+                                   const memory<dfloat> _cubr,
+                                   const memory<dfloat> _cubs,
+                                   const memory<dfloat> _cubt,
+                                   memory<dfloat>& _cubProject);
+  static void CubatureWeakDmatricesTet3D(const int _N,
+                                         const memory<dfloat> _r,
+                                         const memory<dfloat> _s,
+                                         const memory<dfloat> _t,
+                                         const memory<dfloat> _cubr,
+                                         const memory<dfloat> _cubs,
+                                         const memory<dfloat> _cubt,
+                                         memory<dfloat>& _cubPDT); // one output where the pointer-based API had _cubPDrT, _cubPDsT and _cubPDtT; the packing order is not visible in this declaration
+  static void CubatureSurfaceMatricesTet3D(const int _N,
+                                           const memory<dfloat> _r,
+                                           const memory<dfloat> _s,
+                                           const memory<dfloat> _t,
+                                           const memory<int> _faceNodes,
+                                           const memory<dfloat> _intr,
+                                           const memory<dfloat> _ints,
+                                           const memory<dfloat> _intw,
+                                           memory<dfloat>& _intInterp,
+                                           memory<dfloat>& _intLIFT);
+  static void SEMFEMInterpMatrixTet3D(const int _N,
+                                      const memory<dfloat> _r,
+                                      const memory<dfloat> _s,
+                                      const memory<dfloat> _t,
+                                      const memory<dfloat> _rFEM,
+                                      const memory<dfloat> _sFEM,
+                                      const memory<dfloat> _tFEM,
+                                      memory<dfloat>& I);
+  static void WarpShiftFace3D(const int _N, const dfloat alpha, // the explicit point count (int Npoints) of the pointer-based API is gone; NOTE(review): presumably derived from the memory<> arguments -- confirm
+                              const memory<dfloat> L1,
+                              const memory<dfloat> L2,
+                              const memory<dfloat> L3,
+                              memory<dfloat> w1, // w1/w2 are non-const and passed by value; NOTE(review): presumably written in place -- confirm in the definition
+                              memory<dfloat> w2);
+  static void WarpBlendTransformTet3D(const int _N,
+                                      memory<dfloat> _r, // _r/_s/_t are non-const and passed by value; NOTE(review): presumably transformed in place -- confirm in the definition
+                                      memory<dfloat> _s,
+                                      memory<dfloat> _t,
+                                      const dfloat alphaIn=-1); // defaults to -1; NOTE(review): presumably a sentinel meaning "use a built-in alpha" -- confirm
 
 
   //Hexs
-  static void NodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t);
-  static void FaceNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t,  int *_faceNodes);
-  static void VertexNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_vertexNodes);
-  static void EquispacedNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t);
-  static void EquispacedEToVHex3D(int _N, int *_EToV);
-  static void SEMFEMEToVHex3D(int _N, int *_EToV);
-  static void OrthonormalBasisHex3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *P);
-  static void GradOrthonormalBasisHex3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *Pr, dfloat *Ps, dfloat *Pt);
-  static void VandermondeHex3D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *t, dfloat *V);
-  static void GradVandermondeHex3D(int N, int Npoints, dfloat *r, dfloat *s, dfloat *t,
-                                   dfloat *Vr, dfloat *Vs, dfloat *Vt);
-  static void MassMatrixHex3D(int _Np, dfloat *V, dfloat *_MM);
-  static void LumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_MM);
-  static void invLumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_invMM);
-  static void DmatrixHex3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t,
-                           dfloat *_Dr, dfloat *_Ds, dfloat *_Dt);
-  static void InterpolationMatrixHex3D(int _N,
-                                       int NpointsIn, dfloat *rIn, dfloat *sIn, dfloat *tIn,
-                                       int NpointsOut, dfloat *rOut, dfloat *sOut, dfloat *tOut,
-                                       dfloat *I);
+  static void NodesHex3D(const int _N,
+                         memory<dfloat>& _r,
+                         memory<dfloat>& _s,
+                         memory<dfloat>& _t);
+  static void FaceNodesHex3D(const int _N,
+                             const memory<dfloat> _r,
+                             const memory<dfloat> _s,
+                             const memory<dfloat> _t,
+                             memory<int>& _faceNodes);
+  static void VertexNodesHex3D(const int _N,
+                               const memory<dfloat> _r,
+                               const memory<dfloat> _s,
+                               const memory<dfloat> _t,
+                               memory<int>& _vertexNodes);
+  static void FaceNodeMatchingHex3D(const memory<dfloat> _r,
+                                    const memory<dfloat> _s,
+                                    const memory<dfloat> _t,
+                                    const memory<int> _faceNodes,
+                                    const memory<int> _faceVertices,
+                                    memory<int>& R);
+  static void EquispacedNodesHex3D(const int _N,
+                                   memory<dfloat>& _r,
+                                   memory<dfloat>& _s,
+                                   memory<dfloat>& _t);
+  static void EquispacedEToVHex3D(const int _N, memory<int>& _EToV);
+  static void SEMFEMEToVHex3D(const int _N, memory<int>& _EToV);
+  static void OrthonormalBasisHex3D(const dfloat a, const dfloat b, const dfloat c,
+                                    const int i, const int j, const int k,
+                                    dfloat& P);
+  static void GradOrthonormalBasisHex3D(const dfloat a, const dfloat b, const dfloat c,
+                                        const int i, const int j, const int k,
+                                        dfloat& Pr, dfloat& Ps, dfloat& Pt);
+  static void VandermondeHex3D(const int _N,
+                               const memory<dfloat> _r,
+                               const memory<dfloat> _s,
+                               const memory<dfloat> _t,
+                               memory<dfloat>& V);
+  static void GradVandermondeHex3D(const int _N,
+                                   const memory<dfloat> _r,
+                                   const memory<dfloat> _s,
+                                   const memory<dfloat> _t,
+                                   memory<dfloat>& Vr,
+                                   memory<dfloat>& Vs,
+                                   memory<dfloat>& Vt);
+  static void MassMatrixHex3D(const int _Np,
+                              const memory<dfloat> V,
+                              memory<dfloat>& _MM);
+  static void LumpedMassMatrixHex3D(const int _N,
+                                    const memory<dfloat> _gllw,
+                                    memory<dfloat>& _MM);
+  static void invLumpedMassMatrixHex3D(const int _N,
+                                       const memory<dfloat> _gllw,
+                                       memory<dfloat>& _invMM);
+  static void DmatrixHex3D(const int _N,
+                           const memory<dfloat> _r,
+                           const memory<dfloat> _s,
+                           const memory<dfloat> _t,
+                           memory<dfloat>& _D); // one output where the pointer-based API had separate _Dr, _Ds and _Dt; the packing order is not visible in this declaration
+  static void InterpolationMatrixHex3D(const int _N,
+                                       const memory<dfloat> rIn,
+                                       const memory<dfloat> sIn,
+                                       const memory<dfloat> tIn,
+                                       const memory<dfloat> rOut,
+                                       const memory<dfloat> sOut,
+                                       const memory<dfloat> tOut,
+                                       memory<dfloat>& I);
 };
 
+} //namespace libp
+
 #endif
 
diff --git a/include/mesh/mesh2D.hpp b/include/mesh/mesh2D.hpp
deleted file mode 100644
index 1ced54f08..000000000
--- a/include/mesh/mesh2D.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef MESH2D_HPP
-#define MESH2D_HPP 1
-
-#include "meshDefines2D.h"
-
-class mesh2D: public mesh_t {
-public:
-  mesh2D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm);
-
-  // repartition elements in parallel
-  void GeometricPartition();
-
-  // serial face-node to face-node connection
-  void ConnectFaceNodes();
-
-  // setup occa buffers
-  virtual void OccaSetup();
-
-  // print out mesh partition in parallel
-  void PrintVTU(const char *fileName);
-};
-
-class meshTri2D: public mesh2D {
-public:
-  meshTri2D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm);
-  void ParallelReader(const char *fileName);
-  void SetupBox();
-  void SetupPmlBox();
-  void ReferenceNodes(int N);
-  void PhysicalNodes();
-  void GeometricFactors();
-  void SurfaceGeometricFactors();
-  void OccaSetup();
-
-  void CubatureSetup();
-  void CubatureNodes();
-
-  void MassMatrixKernelSetup(int Nfields);
-
-  void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr);
-
-  dfloat ElementCharacteristicLength(dlong e);
-};
-
-class meshQuad2D: public mesh2D {
-public:
-  meshQuad2D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm);
-  void ParallelReader(const char *fileName);
-  void SetupBox();
-  void SetupPmlBox();
-  void ReferenceNodes(int N);
-  void PhysicalNodes();
-  void GeometricFactors();
-  void SurfaceGeometricFactors();
-  void OccaSetup();
-
-  void CubatureSetup();
-  void CubatureNodes();
-
-  void MassMatrixKernelSetup(int Nfields);
-
-  void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr);
-
-  dfloat ElementCharacteristicLength(dlong e);
-};
-
-#endif
-
diff --git a/include/mesh/mesh3D.hpp b/include/mesh/mesh3D.hpp
deleted file mode 100644
index 8eb4a4814..000000000
--- a/include/mesh/mesh3D.hpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef MESH3D_HPP
-#define MESH3D_HPP 1
-
-#include "meshDefines3D.h"
-
-class mesh3D: public mesh_t {
-public:
-  mesh3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm);
-
-  // repartition elements in parallel
-  void GeometricPartition();
-
-  // serial face-node to face-node connection
-  void ConnectFaceNodes();
-
-  inline
-  void ConnectFaceModes(int *faceModes, dfloat *V) {}; //not implemented yet
-
-  // setup occa buffers
-  virtual void OccaSetup();
-
-  // print out mesh partition in parallel
-  void PrintVTU(const char *fileName);
-};
-
-class meshTri3D: public mesh3D {
-public:
-  meshTri3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm);
-  void ParallelReader(const char *fileName);
-  void SetupBox();
-  void SetupPmlBox();
-  void ReferenceNodes(int N);
-  void PhysicalNodes();
-  void GeometricFactors();
-  void SurfaceGeometricFactors();
-  void OccaSetup();
-
-  void CubatureSetup();
-  void CubatureNodes();
-
-  void MassMatrixKernelSetup(int Nfields);
-
-  void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr);
-
-  dfloat ElementCharacteristicLength(dlong e);
-};
-
-class meshQuad3D: public mesh3D {
-public:
-  meshQuad3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm);
-  void ParallelReader(const char *fileName);
-  void SetupBox();
-  void SetupPmlBox();
-  void ReferenceNodes(int N);
-  void PhysicalNodes();
-  void GeometricFactors();
-  void SurfaceGeometricFactors();
-  void OccaSetup();
-
-  void CubatureSetup();
-  void CubatureNodes();
-
-  void MassMatrixKernelSetup(int Nfields);
-
-  void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr);
-
-  dfloat ElementCharacteristicLength(dlong e);
-};
-
-class meshTet3D: public mesh3D {
-public:
-  meshTet3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm);
-  void ParallelReader(const char *fileName);
-  void SetupBox();
-  void SetupPmlBox();
-  void ReferenceNodes(int N);
-  void PhysicalNodes();
-  void GeometricFactors();
-  void SurfaceGeometricFactors();
-  void OccaSetup();
-
-  void CubatureSetup();
-  void CubatureNodes();
-
-  void MassMatrixKernelSetup(int Nfields);
-
-  void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr);
-
-  dfloat ElementCharacteristicLength(dlong e);
-};
-
-class meshHex3D: public mesh3D {
-public:
-  meshHex3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm);
-  void ParallelReader(const char *fileName);
-  void SetupBox();
-  void SetupPmlBox();
-  void ReferenceNodes(int N);
-  void PhysicalNodes();
-  void GeometricFactors();
-  void SurfaceGeometricFactors();
-  void OccaSetup();
-
-  void CubatureSetup();
-  void CubatureNodes();
-
-  void MassMatrixKernelSetup(int Nfields);
-
-  void PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch=nullptr);
-
-  dfloat ElementCharacteristicLength(dlong e);
-};
-
-#endif
-
diff --git a/include/mesh/meshDefines3D.h b/include/mesh/meshDefines3D.h
deleted file mode 100644
index e254b22d3..000000000
--- a/include/mesh/meshDefines3D.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef MESH_DEFINES3D_H
-#define MESH_DEFINES3D_H 1
-
-/* offsets for geometric factors */
-#define RXID 0
-#define RYID 1
-#define SXID 2
-#define SYID 3
-#define  JID 4
-#define JWID 5
-#define IJWID 6
-#define RZID 7
-#define SZID 8
-#define TXID 9
-#define TYID 10
-#define TZID 11
-
-/* offsets for second order geometric factors */
-#define G00ID 0
-#define G01ID 1
-#define G11ID 2
-#define GWJID 3
-#define G12ID 4
-#define G02ID 5
-#define G22ID 6
-
-
-/* offsets for nx, ny, sJ, 1/J */
-#define NXID 0
-#define NYID 1
-#define SJID 2
-#define IJID 3
-#define IHID 4
-#define WSJID 5
-#define WIJID 6
-#define NZID 7
-#define STXID 8
-#define STYID 9
-#define STZID 10
-#define SBXID 11
-#define SBYID 12
-#define SBZID 13
-#define SURXID 14
-#define SURYID 15
-#define SURZID 16
-
-// //offsets for boltzmann PML variables
-// #define QXID1 0
-// #define QXID2 1
-// #define QXID3 2
-// #define QXID4 3
-// #define QXID5 4
-// #define QXID6 5
-// #define QXID8 6
-// //
-// #define QYID1 7
-// #define QYID2 8
-// #define QYID3 9
-// #define QYID4 10
-// #define QYID5 11
-// #define QYID7 12
-// #define QYID9 13
-// //
-// #define QZID1 14
-// #define QZID2 15
-// #define QZID3 16
-// #define QZID4 17
-// #define QZID6 18
-// #define QZID7 19
-// #define QZID10  20
-
-#endif
-
diff --git a/include/ogs.hpp b/include/ogs.hpp
old mode 100644
new mode 100755
index df81c7c91..e53c7408f
--- a/include/ogs.hpp
+++ b/include/ogs.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,13 +29,15 @@ SOFTWARE.
 
   The code
 
-    MPI_Comm comm;
-  	dlong N;
-    hlong id[N];    // the hlong and dlong types are defined in "types.h"
-    int verbose;
-    occa::device device
+    comm_t comm;
+    dlong N;
+    memory<hlong> id(N);    // the hlong and dlong types are defined in "types.h"
+    bool verbose;
+    bool unique;
+    ogs_t ogs;
     ...
-    ogs_t *ogs = ogs_t::Setup(N, id, comm, verbose, device);
+    ogs.Setup(N, id, comm, ogs::Signed,
+              ogs::Auto, unique, verbose, platform);
 
   defines a partition of the set of (processor, local index) pairs,
     (p,i) \in S_j  iff   abs(id[i]) == j  on processor p
@@ -43,20 +45,21 @@ SOFTWARE.
     same id (=j).
   S_0 is treated specially --- it is ignored completely
     (i.e., when id[i] == 0, local index i does not participate in any
-    gather/scatter operation
+    gather/scatter operation)
   If id[i] on proc p is negative then the pair (p,i) is "flagged". This
   determines the non-symmetric behavior. For the simpler, symmetric case,
-  all id's should be positive.
+  ogs::Unsigned can be passed to the 'Kind' parameter, which
+  treats all id's as positive.
 
   When "ogs" is no longer needed, free it with
 
-    ogsFree(ogs);
+    ogs.Free();
 
   A basic gatherScatter operation is, e.g.,
 
-    occa::memory o_v;
+    deviceMemory<double> o_v;
     ...
-    ogs->GatherScatter(o_v, ogs_double, ogs_add, ogs_sym);
+    ogs.GatherScatter(o_v, 1, ogs::Add, ogs::Sym);
 
   This gs call has the effect,
 
@@ -70,92 +73,65 @@ SOFTWARE.
 
   Summation on doubles is not the only operation and datatype supported. Support
   includes the operations
-    ogs_add, ogs_mul, ogs_max, ogs_min
-  and datatypes
-    ogs_dfloat, ogs_double, ogs_float, ogs_int, ogs_longlong, ogs_dlong, ogs_hlong.
+    ogs::Add, ogs::Mul, ogs::Max, ogs::Min
+  and datatypes: float, double, int, long long int.
 
-  For the nonsymmetric behavior, the "transpose" parameter is important:
+  For the nonsymmetric behavior, the "Transpose" parameter is important:
 
-    ogs->GatherScatter(o_v, ogs_double, ogs_add, [ogs_notrans/ogs_trans/ogs_sym]);
+    ogs.GatherScatter(o_v, 1, ogs::Add, ogs::[NoTrans/Trans/Sym]);
 
-  When transpose == ogs_notrans, any "flagged" (p,i) pairs (id[i] negative on p)
+  When transpose == ogs::NoTrans, any "flagged" (p,i) pairs (id[i] negative on p)
   do not participate in the sum, but *do* still receive the sum on output.
   As a special case, when only one (p,i) pair is unflagged per group this
   corresponds to the rectangular "Q" matrix referred to above.
 
-  When transpose == ogs_trans, the "flagged" (p,i) pairs *do* participate in the sum,
+  When transpose == ogs::Trans, the "flagged" (p,i) pairs *do* participate in the sum,
   but do *not* get set on output. In the special case of only one unflagged
   (p,i) pair, this corresponds to the transpose of "Q" referred to above.
 
-  When transpose == ogs_sym, all ids are considered "unflagged". That is,
+  When transpose == ogs::Sym, all ids are considered "unflagged". That is,
   the "flagged" (p,i) pairs *do* participate in the sum, and *do* get set
   on output.
 
-  An additional nonsymmetric operation is
+  When the 'unique' parameter is passed as 'true', the setup call modifies ids,
+  "flagging" (by negating id[i]) all (p,i) pairs in each group except one.
+  The sole "unflagged" member of the group is chosen in an arbitrary but
+  consistent way. When all groups of (p,i) pairs have a single "unflagged"
+  pair in this manner, an additional nonsymmetric operation is available:
 
-    ogs->Gather(o_Gv, o_v, ogs_double, ogs_add, ogs_notrans);
+    ogs.Gather(o_Gv, o_v, 1, ogs::Add, ogs::Trans);
 
   this has the effect of "assembling" the vector o_Gv. That is
 
     o_Gv[gid[j]] <--  \sum_{ (p,j) \in S_{id[i]} } o_v_(p) [j]
 
   for some ordering gid. As with the GatherScatter operation, when
-  transpose == ogs_notrans, any "flagged" (p,i) pairs (id[i] negative on p)
-  do not participate in the sum, whereas when transpose == ogs_trans the "flagged"
-  (p,i) pairs *do* participate in the sum. Using transpose == ogs_sym is not
-  supported (the symmetrized version of this operation is just GatherScatter).
+  Transpose == ogs::NoTrans, any "flagged" (p,i) pairs (id[i] negative on p)
+  do not participate in the sum, otherwise the "flagged" (p,i) pairs *do*
+  participate in the sum.
 
-  The reverse of this operation is
+  The inverse of this operation is
 
-    ogs->Scatter(o_v, o_Gv, ogs_double, ogs_add, ogs_notrans);
+    ogs.Scatter(o_v, o_Gv, 1, ogs::Trans);
 
   which has the effect of scattering in the assembled entries in o_Gv back to the
-  orginal ordering. When transpose == ogs_notrans, "flagged" (p,i) pairs (id[i]
-  negative on p) recieve their corresponding entry from o_Gv, and when
-  transpose == ogs_trans the "flagged" (p,i) pairs do *not* recieve an entry.
-  Using transpose == ogs_sym is not supported.
+  original ordering. When Transpose == ogs::Trans, "flagged" (p,i) pairs (id[i]
+  negative on p) do *not* receive their corresponding entry from o_Gv, otherwise
+  the "flagged" (p,i) pairs receive an entry.
 
-  A versions for vectors (contiguously packed) is, e.g.,
+  For operating on contiguously packed vectors, the k parameter is used, e.g.,
 
-    occa::memory o_v;
-    ogs->GatherScatterVec(o_v, k, ogs_double, ogs_add, ogs_sym);
+    ogs.GatherScatter(o_v, 3, ogs::Add, ogs::Sym);
 
-  which is like "GatherScatter" operating on the datatype double[k],
+  which is like "GatherScatter" operating on the datatype double[3],
   with summation here being vector summation. Number of messages sent
   is independent of k.
 
-  For combining the communication for "GatherScatter" on multiple arrays:
-
-    occa::memory o_v1, o_v2, ..., o_vk;
-
-    ogs->GatherScatterMany(o_v, k, stride, ogs_double, op, trans);
-
-  when the arrays o_v1, o_v2, ..., o_vk are packed in o_v as
-
-    o_v1 = o_v + 0*stride, o_v2 = o_v + 1*stride, ...
-
-  This call is equivalent to
-
-    ogs->GatherScatter(o_v1, ogs_double, op, trans);
-    ogs->GatherScatter(o_v2, ogs_double, op, trans);
-    ...
-    ogs->GatherScatter(o_vk, ogs_double, op, trans);
-
-  except that all communication is done together.
-
-  A utility function, ogs_t::Unique is provided
-
-    ogs_t::Unique(ids, N, comm);
-
-  This call modifies ids, "flagging" (by negating id[i]) all (p,i) pairs in
-  each group except one. The sole "unflagged" member of the group is chosen
-  in an arbitrary but consistent way.
-
   Asynchronous versions of the various GatherScatter functions are provided by
 
-    ogs->GatherScatterStart(o_v, ogs_double, ogs_add, ogs_sym);
+    ogs.GatherScatterStart(o_v, k, ogs::Add, ogs::Sym);
     ...
-    ogs->GatherScatterFinish(o_v, ogs_double, ogs_add, ogs_sym);
+    ogs.GatherScatterFinish(o_v, k, ogs::Add, ogs::Sym);
 
   MPI communication is not initiated in GatherScatterStart, rather some initial
   message packing and host<->device transfers are queued. The user can then queue
@@ -163,260 +139,263 @@ SOFTWARE.
   calling GatherScatterFinish. The MPI communication will then take place while the
   user's local kernels execute to maximize the amount of communication hiding.
 
-  Finally, a thin wrapper of the ogs_t object, named halo_t is provided. This object
-  is intended to provided support for thin halo exchages between MPI procceses.
+  Finally, a specialized communication object, named halo_t is provided. This
+  object is analogous to an ogs_t object, where each group S_j has a sole
+  "unflagged" (p,i) pair, as discussed above regarding the 'unique' parameter,
+  and furthermore each "unflagged" (p,i) pair has a unique label ids[i] on its
+  process. That is, for each "unflagged" (p,i), there are no other, flagged or
+  unflagged, pairs (p,j) on process p with the label ids[i].
 
-*/
+  With this particular flagging of (p,i) pairs, simple exchange routines are
+  defined:
 
-#ifndef OGS_HPP
-#define OGS_HPP
+    halo_t halo;
+    halo.Setup(N, ids, comm, ogs::Auto, verbose, platform);
+    halo.Exchange(o_v, k);
 
-#include "core.hpp"
-#include "platform.hpp"
+  which has the effect of filling all "flagged" pairs (p,i) on all processes with
+  the corresponding value from the unique "unflagged" pair in S_j.
 
-//ogs defs
-#include "ogs/ogsDefs.h"
+  An additional utility operation available in the halo_t object is
 
-/* type enum */
-#define LIST OGS_FOR_EACH_TYPE(ITEM) ogs_type_n
-#define ITEM(T) ogs_##T,
-typedef enum { LIST } ogs_type;
-#undef ITEM
-#undef LIST
+    halo.Combine(o_v, k);
 
-/* operation enum */
-#define LIST OGS_FOR_EACH_OP(T,ITEM) ogs_op_n
-#define ITEM(T,op) ogs_##op,
-typedef enum { LIST } ogs_op;
-#undef ITEM
-#undef LIST
-
-/* transpose switch */
-typedef enum { ogs_sym, ogs_notrans, ogs_trans } ogs_transpose;
+  which has the effect of summing the entries in S_j and writing the result to
+  the sole "unflagged" pair in S_j.
 
-class ogsData_t {
-public:
-  dlong Nrows=0;
-  dlong nnz=0;
-  dlong NrowBlocks=0;
-
-  dlong *blockRowStarts=nullptr;
-  dlong *rowStarts=nullptr;
-  dlong *colIds=nullptr;
-
-  occa::memory o_blockRowStarts;
-  occa::memory o_rowStarts;
-  occa::memory o_colIds;
-
-  ogsData_t() {};
-
-  ~ogsData_t() {
-    if(blockRowStarts) {free(blockRowStarts); blockRowStarts=nullptr;}
-    if(rowStarts) {free(rowStarts); rowStarts=nullptr;}
-    if(colIds) {free(colIds); colIds=nullptr;}
-    o_blockRowStarts.free();
-    o_rowStarts.free();
-    o_colIds.free();
-  }
-};
+*/
 
-// OCCA+gslib gather scatter
-class ogs_t {
-public:
-  platform_t& platform;
-  MPI_Comm comm;
+#ifndef OGS_HPP
+#define OGS_HPP
 
-  dlong         N=0;
-  dlong         Nlocal=0;         //  number of local nodes
-  dlong         Nhalo=0;          //  number of halo nodes
+#include "core.hpp"
+#include "platform.hpp"
 
-  dlong         Ngather=0;        //  total number of gather nodes
-  dlong         NgatherHalo=0;    //  number of halo nodes for gathered vector
-  hlong         NgatherGlobal=0;  //  global number of gather nodes
+namespace libp {
 
-  ogsData_t localGather, localScatter;
-  ogsData_t haloGather, haloScatter;
+namespace ogs {
 
-  ogsData_t fusedGather, fusedScatter;
-  ogsData_t symGatherScatter;
+/* type enum */
+typedef enum { Float, Double, Int32, Int64} Type;
 
-  void *gsh=nullptr;       // gslib handle
-  void *gshSym=nullptr;    // Symmetrized gslib handle (all ids made positive)
+constexpr Type Dfloat = (std::is_same<double, dfloat>::value)
+                          ? Double : Float;
+// constexpr Type Pfloat = (std::is_same<double, pfloat>::value)
+//                           ? Double : Float;
+constexpr Type Dlong  = (std::is_same<int32_t, dlong>::value)
+                          ? Int32 : Int64;
+constexpr Type Hlong  = (std::is_same<int32_t, hlong>::value)
+                          ? Int32 : Int64;
 
-  void* hostBuf=nullptr;
-  size_t hostBufSize=0;
+/* operation enum */
+typedef enum { Add, Mul, Max, Min} Op;
 
-  void* haloBuf=nullptr;
-  occa::memory o_haloBuf;
-  occa::memory h_haloBuf;
+/* transpose switch */
+typedef enum { Sym, NoTrans, Trans } Transpose;
 
-  dlong *GlobalToLocal;
-  occa::memory o_GlobalToLocal;
+/* method switch */
+typedef enum { Auto, Pairwise, CrystalRouter, AllToAll} Method;
 
-  ogs_t(platform_t& _platform, MPI_Comm _comm):
-    platform(_platform), comm(_comm) {};
+/* kind enum */
+typedef enum { Unsigned, Signed, Halo} Kind;
 
-  void Free();
+} //namespace ogs
 
-  static ogs_t *Setup(dlong N, hlong *ids, MPI_Comm &comm,
-                      int verbose, platform_t& platform);
+} //namespace libp
 
-  static void Unique(hlong *ids, dlong _N, MPI_Comm _comm);
+#include "ogs/ogsBase.hpp"
 
-  // Host buffer versions
-  void GatherScatter    (void  *v,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherScatterVec (void  *v, const int k,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherScatterMany(void  *v, const int k, const dlong stride,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
+namespace libp {
 
-  void Gather    (void  *gv, void  *v,
-                  const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherVec (void  *gv, void  *v, const int k,
-                  const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherMany(void  *gv, void  *v, const int k,
-                  const dlong gstride, const dlong stride,
-                  const ogs_type type, const ogs_op op, const ogs_transpose trans);
+namespace ogs {
 
-  void Scatter    (void  *v, void  *gv,
-                   const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void ScatterVec (void  *v, void  *gv, const int k,
-                   const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void ScatterMany(void  *v, void  *gv, const int k,
-                   const dlong stride, const dlong gstride,
-                   const ogs_type type, const ogs_op op, const ogs_transpose trans);
+//pre-build kernels
+void InitializeKernels(platform_t& platform, const Type type, const Op op);
 
+// OCCA Gather Scatter
+class ogs_t : public ogsBase_t {
+public:
+  ogs_t()=default;
+  ~ogs_t()=default;
+
+  void Setup(const dlong _N,
+             memory<hlong> ids,
+             comm_t _comm,
+             const Kind _kind,
+             const Method method,
+             const bool _unique,
+             const bool verbose,
+             platform_t& _platform);
+
+  void SetupGlobalToLocalMapping(memory<dlong> GlobalToLocal);
+
+  // Synchronous host versions
+  template<typename T>
+  void GatherScatter(memory<T> v,
+                     const int k,
+                     const Op op,
+                     const Transpose trans);
+  // Asynchronous host buffer versions
+  template<typename T>
+  void GatherScatterStart (memory<T> v,
+                           const int k,
+                           const Op op,
+                           const Transpose trans);
+  template<typename T>
+  void GatherScatterFinish(memory<T> v,
+                           const int k,
+                           const Op op,
+                           const Transpose trans);
   // Synchronous device buffer versions
-  void GatherScatter    (occa::memory&  o_v,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherScatterVec (occa::memory&  o_v, const int k,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherScatterMany(occa::memory&  o_v, const int k,
-                         const dlong stride,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
-
-  void Gather    (occa::memory&  o_gv, occa::memory&  o_v,
-                  const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherVec (occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                  const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherMany(occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                  const dlong gstride, const dlong stride,
-                  const ogs_type type, const ogs_op op, const ogs_transpose trans);
-
-  void Scatter    (occa::memory&  o_v, occa::memory&  o_gv,
-                   const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void ScatterVec (occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                   const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void ScatterMany(occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                   const dlong stride, const dlong gstride,
-                   const ogs_type type, const ogs_op op, const ogs_transpose trans);
-
+  template<typename T>
+  void GatherScatter(deviceMemory<T> o_v,
+                     const int k,
+                     const Op op,
+                     const Transpose trans);
   // Asynchronous device buffer versions
-  void GatherScatterStart     (occa::memory&  o_v,
-                               const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherScatterFinish    (occa::memory&  o_v,
-                               const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherScatterVecStart  (occa::memory&  o_v, const int k,
-                               const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherScatterVecFinish (occa::memory&  o_v, const int k,
-                               const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherScatterManyStart (occa::memory&  o_v, const int k, const dlong stride,
-                               const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherScatterManyFinish(occa::memory&  o_v, const int k, const dlong stride,
-                               const ogs_type type, const ogs_op op, const ogs_transpose trans);
-
-  void GatherStart     (occa::memory&  o_gv, occa::memory&  o_v,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherFinish    (occa::memory&  o_gv, occa::memory&  o_v,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherVecStart  (occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherVecFinish (occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherManyStart (occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                        const dlong gstride, const dlong stride,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void GatherManyFinish(occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                        const dlong gstride, const dlong stride,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans);
-
-  void ScatterStart     (occa::memory&  o_v, occa::memory&  o_gv,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void ScatterFinish    (occa::memory&  o_v, occa::memory&  o_gv,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void ScatterVecStart  (occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void ScatterVecFinish (occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void ScatterManyStart (occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                         const dlong stride, const dlong gstride,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
-  void ScatterManyFinish(occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                         const dlong stride, const dlong gstride,
-                         const ogs_type type, const ogs_op op, const ogs_transpose trans);
-
-  void GatheredHaloExchangeSetup();
-  void GatheredHaloExchangeStart(occa::memory& o_v,
-                                 const int k,
-                                 const ogs_type type);
-  void GatheredHaloExchangeFinish(occa::memory& o_v,
-                                 const int k,
-                                 const ogs_type type);
-
-  void reallocHostBuffer(size_t Nbytes);
-  void reallocOccaBuffer(size_t Nbytes);
+  template<typename T>
+  void GatherScatterStart (deviceMemory<T> o_v,
+                           const int k,
+                           const Op op,
+                           const Transpose trans);
+  template<typename T>
+  void GatherScatterFinish(deviceMemory<T> o_v,
+                           const int k,
+                           const Op op,
+                           const Transpose trans);
+
+  // Synchronous host versions
+  template<typename T>
+  void Gather(memory<T> gv,
+              const memory<T> v,
+              const int k,
+              const Op op,
+              const Transpose trans);
+  // Asynchronous host buffer versions
+  template<typename T>
+  void GatherStart (memory<T> gv,
+                    const memory<T> v,
+                    const int k,
+                    const Op op,
+                    const Transpose trans);
+  template<typename T>
+  void GatherFinish(memory<T> gv,
+                    const memory<T> v,
+                    const int k,
+                    const Op op,
+                    const Transpose trans);
+  // Synchronous device buffer versions
+  template<typename T>
+  void Gather(deviceMemory<T> o_gv,
+              deviceMemory<T> o_v,
+              const int k,
+              const Op op,
+              const Transpose trans);
+  // Asynchronous device buffer versions
+  template<typename T>
+  void GatherStart (deviceMemory<T> o_gv,
+                    deviceMemory<T> o_v,
+                    const int k,
+                    const Op op,
+                    const Transpose trans);
+  template<typename T>
+  void GatherFinish(deviceMemory<T> o_gv,
+                    deviceMemory<T> o_v,
+                    const int k,
+                    const Op op,
+                    const Transpose trans);
+
+  // Synchronous host versions
+  template<typename T>
+  void Scatter(memory<T> v,
+               const memory<T> gv,
+               const int k,
+               const Transpose trans);
+  // Asynchronous host buffer versions
+  template<typename T>
+  void ScatterStart (memory<T> v,
+                     const memory<T> gv,
+                     const int k,
+                     const Transpose trans);
+  template<typename T>
+  void ScatterFinish(memory<T> v,
+                     memory<T> gv,
+                     const int k,
+                     const Transpose trans);
+  // Synchronous device buffer versions
+  template<typename T>
+  void Scatter(deviceMemory<T> o_v,
+               deviceMemory<T> o_gv,
+               const int k,
+               const Transpose trans);
+  // Asynchronous device buffer versions
+  template<typename T>
+  void ScatterStart (deviceMemory<T> o_v,
+                     deviceMemory<T> o_gv,
+                     const int k,
+                     const Transpose trans);
+  template<typename T>
+  void ScatterFinish(deviceMemory<T> o_v,
+                     deviceMemory<T> o_gv,
+                     const int k,
+                     const Transpose trans);
+
+  friend class halo_t;
 };
 
-// OCCA halo exchange (thin wrapper of an ogs_t object)
-class halo_t {
+// OCCA Halo
+class halo_t : public ogsBase_t {
 public:
-  ogs_t* ogs;
-
-  void Free() { if (ogs) { ogs->Free(); ogs=nullptr; } }
-
-  static halo_t *Setup(dlong N, hlong *ids, MPI_Comm &comm,
-                       int verbose, platform_t& platform) {
-    halo_t *halo = new halo_t();
-    halo->ogs = ogs_t::Setup(N, ids, comm, verbose, platform);
-    return halo;
-  }
-
-  // Synchronous Host buffer version
-  void Exchange(void  *v, const int k, const ogs_type type) {
-    ogs->GatherScatterVec(v, k, type, ogs_add, ogs_notrans);
-  }
-
+  halo_t()=default;
+  ~halo_t()=default;
+
+  bool gathered_halo=false;
+  dlong Nhalo=0;
+
+  void Setup(const dlong _N,
+             memory<hlong> ids,
+             comm_t _comm,
+             const Method method,
+             const bool verbose,
+             platform_t& _platform);
+
+  void SetupFromGather(ogs_t& ogs);
+
+  // Synchronous Host version
+  template<typename T>
+  void Exchange(memory<T> v, const int k);
+  // Asynchronous host version
+  template<typename T>
+  void ExchangeStart (memory<T> v, const int k);
+  template<typename T>
+  void ExchangeFinish(memory<T> v, const int k);
   // Synchronous device buffer version
-  void Exchange(occa::memory &o_v, const int k, const ogs_type type) {
-    ogs->GatherScatterVec(o_v, k, type, ogs_add, ogs_notrans);
-  }
-
+  template<typename T>
+  void Exchange(deviceMemory<T> o_v, const int k);
   // Asynchronous device buffer version
-  void ExchangeStart (occa::memory &o_v, const int k, const ogs_type type) {
-    ogs->GatherScatterVecStart(o_v, k, type, ogs_add, ogs_notrans);
-  }
-  void ExchangeFinish(occa::memory &o_v, const int k, const ogs_type type) {
-    ogs->GatherScatterVecFinish(o_v, k, type, ogs_add, ogs_notrans);
-  }
-
-  // Synchronous Host buffer version
-  void Combine(void  *v, const int k, const ogs_type type) {
-    ogs->GatherScatterVec(v, k, type, ogs_add, ogs_sym);
-  }
-
+  template<typename T>
+  void ExchangeStart (deviceMemory<T> o_v, const int k);
+  template<typename T>
+  void ExchangeFinish(deviceMemory<T> o_v, const int k);
+
+  // Synchronous Host version
+  template<typename T>
+  void Combine(memory<T> v, const int k);
+  // Asynchronous host version
+  template<typename T>
+  void CombineStart (memory<T> v, const int k);
+  template<typename T>
+  void CombineFinish(memory<T> v, const int k);
   // Synchronous device buffer version
-  void Combine(occa::memory &o_v, const int k, const ogs_type type) {
-    ogs->GatherScatterVec(o_v, k, type, ogs_add, ogs_sym);
-  }
-
+  template<typename T>
+  void Combine(deviceMemory<T> o_v, const int k);
   // Asynchronous device buffer version
-  void CombineStart (occa::memory &o_v, const int k, const ogs_type type) {
-    ogs->GatherScatterVecStart(o_v, k, type, ogs_add, ogs_sym);
-  }
-  void CombineFinish(occa::memory &o_v, const int k, const ogs_type type) {
-    ogs->GatherScatterVecFinish(o_v, k, type, ogs_add, ogs_sym);
-  }
+  template<typename T>
+  void CombineStart (deviceMemory<T> o_v, const int k);
+  template<typename T>
+  void CombineFinish(deviceMemory<T> o_v, const int k);
 };
 
+} //namespace ogs
+} //namespace libp
 #endif
diff --git a/include/ogs/ogsBase.hpp b/include/ogs/ogsBase.hpp
new file mode 100644
index 000000000..a0e0355ab
--- /dev/null
+++ b/include/ogs/ogsBase.hpp
@@ -0,0 +1,112 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef OGSBASE_HPP
+#define OGSBASE_HPP
+
+#include "ogs.hpp"
+
+namespace libp {
+
+namespace ogs {
+
+//forward declarations
+class ogsOperator_t;
+class ogsFusedOperator_t;
+class ogsExchange_t;
+
+struct parallelNode_t;
+
+class halo_t;
+
+class ogsBase_t {
+public:
+  platform_t platform;
+  comm_t comm;
+
+  dlong         N=0;
+  dlong         Ngather=0;        //  total number of local positive gather nodes
+
+  dlong         NlocalT=0;        //  number of local gather nodes
+  dlong         NhaloT=0;         //  number of halo gather nodes
+  dlong         NlocalP=0;        //  number of positive local gather nodes
+  dlong         NhaloP=0;         //  number of positive halo gather nodes
+
+  hlong         NgatherGlobal=0;  //  global number of positive gather nodes
+
+  Kind kind;
+  bool unique=false;
+  bool gather_defined=false;
+
+  static stream_t dataStream;
+
+  ogsBase_t()=default;
+  virtual ~ogsBase_t()=default;
+
+  virtual void Setup(const dlong _N,
+                      memory<hlong> ids,
+                      comm_t _comm,
+                      const Kind _kind,
+                      const Method method,
+                      const bool _unique,
+                      const bool verbose,
+                      platform_t& _platform);
+  void Free();
+
+protected:
+  std::shared_ptr<ogsOperator_t> gatherLocal;
+  std::shared_ptr<ogsOperator_t> gatherHalo;
+  std::shared_ptr<ogsExchange_t> exchange;
+
+  void AssertGatherDefined();
+
+private:
+  void FindSharedNodes(const dlong Nids,
+                       memory<parallelNode_t> &nodes,
+                       const int verbose);
+
+  void ConstructSharedNodes(const dlong Nids,
+                           memory<parallelNode_t> &nodes,
+                           dlong &Nshared,
+                           memory<parallelNode_t> &sharedNodes);
+
+  void LocalSignedSetup(const dlong Nids, memory<parallelNode_t> &nodes);
+  void LocalUnsignedSetup(const dlong Nids, memory<parallelNode_t> &nodes);
+  void LocalHaloSetup(const dlong Nids, memory<parallelNode_t> &nodes);
+
+  ogsExchange_t* AutoSetup(dlong Nshared,
+                           memory<parallelNode_t> &sharedNodes,
+                           ogsOperator_t& gatherHalo,
+                           comm_t _comm,
+                           platform_t &_platform,
+                           const int verbose);
+};
+
+} //namespace ogs
+
+} //namespace libp
+
+#endif
diff --git a/include/ogs/ogsDefs.h b/include/ogs/ogsDefs.h
deleted file mode 100644
index 01e63ba46..000000000
--- a/include/ogs/ogsDefs.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-/* the supported types */
-typedef long long long_long;
-#define OGS_FOR_EACH_TYPE(macro) \
-  macro(double   ) \
-  macro(float    ) \
-  macro(int      ) \
-  macro(long     ) \
-  macro(long_long)
-
-/* the supported ops */
-#define OGS_FOR_EACH_OP(T,macro) \
-  macro(T,add) \
-  macro(T,mul) \
-  macro(T,min) \
-  macro(T,max)
-
-#define OGS_DO_add(a,b) a+=b
-#define OGS_DO_mul(a,b) a*=b
-#define OGS_DO_min(a,b) if(b<a) a=b
-#define OGS_DO_max(a,b) if(b>a) a=b
-
-/* type size array */
-#define OGS_TYPE_SIZE_ITEM(T) sizeof(T),
-#define OGS_DEFINE_TYPE_SIZES() \
-  static const unsigned ogs_type_size[] = \
-    { OGS_FOR_EACH_TYPE(OGS_TYPE_SIZE_ITEM) 0 };
-
-/* mapping from ogs types to gs types */
-#define gs_int64_t gs_long_long
-#define OGS_GS_MAP_TYPE_ITEM(T) gs_##T,
-#define OGS_GS_DEFINE_TYPE_MAP() \
-  static const gs_dom ogs_gs_type_map[] = \
-    { OGS_FOR_EACH_TYPE(OGS_GS_MAP_TYPE_ITEM) gs_dom_n };
-
-/* mapping from ogs ops to gs ops */
-#define OGS_GS_MAP_OP_ITEM(T,OP) gs_##OP,
-#define OGS_GS_DEFINE_OP_MAP() \
-  static const gs_op ogs_gs_op_map[] = \
-    { OGS_FOR_EACH_OP(T,OGS_GS_MAP_OP_ITEM) gs_op_n };
diff --git a/include/ogs/ogsExchange.hpp b/include/ogs/ogsExchange.hpp
new file mode 100644
index 000000000..40f6f6f38
--- /dev/null
+++ b/include/ogs/ogsExchange.hpp
@@ -0,0 +1,336 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef OGS_EXCHANGE_HPP
+#define OGS_EXCHANGE_HPP
+
+#include "ogs.hpp"
+#include "ogs/ogsOperator.hpp"
+
+namespace libp {
+
+namespace ogs {
+
+//Abstract base class for the MPI exchange stage of a gatherScatter; Start/Finish/AllocBuffer are pure virtual
+class ogsExchange_t {
+public:
+  platform_t platform;
+  comm_t comm;
+  int rank, size; //set from comm.rank() and comm.size() in the constructor
+
+  dlong Nhalo, NhaloP; //NOTE(review): not initialized by this class - presumably set by the derived constructors
+
+  pinnedMemory<char> h_workspace, h_sendspace; //untyped (char) pinned buffers
+  deviceMemory<char> o_workspace, o_sendspace; //untyped (char) device buffers
+
+  stream_t dataStream; //copied from the _datastream constructor argument
+  static kernel_t extractKernel[4]; //NOTE(review): presumably one kernel per supported type (4 types) - confirm
+
+#ifdef GPU_AWARE_MPI
+  bool gpu_aware=true; //default chosen at compile time by the GPU_AWARE_MPI macro
+#else
+  bool gpu_aware=false; //default chosen at compile time by the GPU_AWARE_MPI macro
+#endif
+
+  ogsExchange_t(platform_t &_platform, comm_t _comm,
+                stream_t _datastream):
+    platform(_platform),
+    comm(_comm),
+    dataStream(_datastream) {
+    rank = comm.rank();
+    size = comm.size();
+  }
+  virtual ~ogsExchange_t() {} //virtual so derived exchanges can be destroyed through a base pointer
+
+  virtual void Start(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans)=0; //pinnedMemory overloads: float, double, int, long long int
+  virtual void Start(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Start(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Start(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans)=0;
+
+  virtual void Start(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans)=0; //deviceMemory overloads: same four types
+  virtual void Start(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Start(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Start(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans)=0;
+
+  virtual void AllocBuffer(size_t Nbytes)=0; //NOTE(review): presumably (re)sizes the exchange buffers for Nbytes per entry - confirm in the implementations
+
+  friend void InitializeKernels(platform_t& platform, const Type type, const Op op); //NOTE(review): friend access, presumably to populate extractKernel - confirm
+};
+
+//MPI communication via single MPI_Alltoallv call
+class ogsAllToAll_t: public ogsExchange_t {
+private:
+
+  dlong NsendN=0, NsendT=0; //NOTE(review): N/T suffixes presumably select the non-transposed/transposed variant (cf. the Transpose argument) - confirm
+  memory<dlong> sendIdsN, sendIdsT;
+  deviceMemory<dlong> o_sendIdsN, o_sendIdsT; //o_ prefix: deviceMemory counterparts of sendIdsN/sendIdsT
+
+  ogsOperator_t postmpi; //NOTE(review): presumably the gather applied to received data after the MPI call - confirm
+
+  memory<int> mpiSendCountsN; //per-variant (N/T) count and offset arrays
+  memory<int> mpiSendCountsT;
+  memory<int> mpiRecvCountsN;
+  memory<int> mpiRecvCountsT;
+  memory<int> mpiSendOffsetsN;
+  memory<int> mpiSendOffsetsT;
+  memory<int> mpiRecvOffsetsN;
+  memory<int> mpiRecvOffsetsT;
+
+  memory<int> sendCounts; //NOTE(review): presumably the mpi* arrays above scaled for the current call - confirm in the implementation
+  memory<int> recvCounts;
+  memory<int> sendOffsets;
+  memory<int> recvOffsets;
+
+  Comm::request_t request; //a single request handle (one collective call per exchange)
+
+public:
+  ogsAllToAll_t(dlong Nshared,
+               memory<parallelNode_t> &sharedNodes,
+               ogsOperator_t &gatherHalo,
+               stream_t _dataStream,
+               comm_t _comm,
+               platform_t &_platform);
+
+  template<typename T>
+  void Start(pinnedMemory<T> &buf, //type-generic declaration for pinnedMemory buffers
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(pinnedMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans); //overrides of the ogsExchange_t pure virtuals (pinnedMemory)
+  virtual void Start(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+
+  template<typename T>
+  void Start(deviceMemory<T> &buf, //type-generic declaration for deviceMemory buffers
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(deviceMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans); //overrides of the ogsExchange_t pure virtuals (deviceMemory)
+  virtual void Start(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+
+  virtual void AllocBuffer(size_t Nbytes); //override of ogsExchange_t::AllocBuffer
+
+};
+
+//MPI communication via pairwise send/recvs
+class ogsPairwise_t: public ogsExchange_t {
+private:
+
+  dlong NsendN=0, NsendT=0; //NOTE(review): N/T suffixes presumably select the non-transposed/transposed variant (cf. the Transpose argument) - confirm
+  memory<dlong> sendIdsN, sendIdsT;
+  deviceMemory<dlong> o_sendIdsN, o_sendIdsT; //o_ prefix: deviceMemory counterparts of sendIdsN/sendIdsT
+
+  ogsOperator_t postmpi; //NOTE(review): presumably the gather applied to received data after the exchange - confirm
+
+  int NranksSendN=0, NranksRecvN=0; //number of ranks to send to / receive from (N variant)
+  int NranksSendT=0, NranksRecvT=0; //number of ranks to send to / receive from (T variant)
+  memory<int> sendRanksN; //per-variant rank lists, counts and offsets
+  memory<int> sendRanksT;
+  memory<int> recvRanksN;
+  memory<int> recvRanksT;
+  memory<int> sendCountsN;
+  memory<int> sendCountsT;
+  memory<int> recvCountsN;
+  memory<int> recvCountsT;
+  memory<int> sendOffsetsN;
+  memory<int> sendOffsetsT;
+  memory<int> recvOffsetsN;
+  memory<int> recvOffsetsT;
+  memory<Comm::request_t> requests; //array of request handles (NOTE(review): presumably one per pending send/recv - confirm)
+
+public:
+  ogsPairwise_t(dlong Nshared,
+               memory<parallelNode_t> &sharedNodes,
+               ogsOperator_t &gatherHalo,
+               stream_t _dataStream,
+               comm_t _comm,
+               platform_t &_platform);
+
+  template<typename T>
+  void Start(pinnedMemory<T> &buf, //type-generic declaration for pinnedMemory buffers
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(pinnedMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans); //overrides of the ogsExchange_t pure virtuals (pinnedMemory)
+  virtual void Start(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+
+  template<typename T>
+  void Start(deviceMemory<T> &buf, //type-generic declaration for deviceMemory buffers
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(deviceMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans); //overrides of the ogsExchange_t pure virtuals (deviceMemory)
+  virtual void Start(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+
+  virtual void AllocBuffer(size_t Nbytes); //override of ogsExchange_t::AllocBuffer
+};
+
+//MPI communication via Crystal Router
+class ogsCrystalRouter_t: public ogsExchange_t {
+private:
+
+  struct crLevel { //per-level data of the router (stored in the levelsN/levelsT arrays below)
+    int Nmsg;
+    int partner; //NOTE(review): presumably the rank exchanged with at this level - confirm
+
+    int Nsend, Nrecv0, Nrecv1;
+    dlong recvOffset;
+
+    memory<dlong> sendIds;
+    deviceMemory<dlong> o_sendIds; //deviceMemory counterpart of sendIds
+
+    ogsOperator_t gather;
+  };
+
+  int buf_id=0, hbuf_id=0; //NOTE(review): presumably select the active entry of o_work/h_work - confirm
+  pinnedMemory<char> h_work[2]; //two untyped pinned work buffers
+  deviceMemory<char> o_work[2]; //two untyped device work buffers
+
+  memory<Comm::request_t> request;
+
+  int Nlevels=0;
+  memory<crLevel> levelsN; //NOTE(review): N/T suffixes presumably select the non-transposed/transposed variant - confirm
+  memory<crLevel> levelsT;
+
+  int NsendMax=0, NrecvMax=0;
+
+public:
+  ogsCrystalRouter_t(dlong Nshared,
+                   memory<parallelNode_t> &sharedNodes,
+                   ogsOperator_t &gatherHalo,
+                   stream_t _dataStream,
+                   comm_t _comm,
+                   platform_t &_platform);
+
+  template<typename T>
+  void Start(pinnedMemory<T> &buf, //type-generic declaration for pinnedMemory buffers
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(pinnedMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans); //overrides of the ogsExchange_t pure virtuals (pinnedMemory)
+  virtual void Start(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+
+  template<typename T>
+  void Start(deviceMemory<T> &buf, //type-generic declaration for deviceMemory buffers
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(deviceMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans); //overrides of the ogsExchange_t pure virtuals (deviceMemory)
+  virtual void Start(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+
+  virtual void AllocBuffer(size_t Nbytes); //override of ogsExchange_t::AllocBuffer
+};
+
+} //namespace ogs
+
+} //namespace libp
+
+#endif
diff --git a/include/ogs/ogsKernels.hpp b/include/ogs/ogsKernels.hpp
deleted file mode 100644
index cfab2f207..000000000
--- a/include/ogs/ogsKernels.hpp
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef OGS_KERNELS_HPP
-#define OGS_KERNELS_HPP
-
-#include <limits>
-#include "ogs.hpp"
-#include "ogsDefs.h"
-
-#define DEFINE_ADD_OGS_INIT(T)                                  \
-  static T init_##T##_add = (T)  0;                             \
-  static T init_##T##_mul = (T)  1;                             \
-  static T init_##T##_min = (T)  std::numeric_limits<T>::max(); \
-  static T init_##T##_max = (T) -std::numeric_limits<T>::max();
-
-class ogsData_t;
-
-namespace ogs {
-
-extern const int blockSize;
-extern const int gatherNodesPerBlock;
-
-extern int Nrefs;
-
-extern occa::stream dataStream;
-
-void initKernels(platform_t& platform);
-
-void freeKernels();
-
-//Setup a gslib struct
-void *gsSetup(MPI_Comm meshComm,
-              dlong NuniqueBases,
-              hlong *gatherGlobalNodes,
-              int nonsymm, int verbose);
-
-void gsUnique(hlong *gatherGlobalNodes,
-              dlong NuniqueBases,
-              MPI_Comm meshComm);
-
-void gsFree(void* gs);
-
-#define DEFINE_GATHERSCATTER_KERNEL(T,OP) \
-  extern occa::kernel gatherScatterKernel_##T##_##OP;
-
-#define DEFINE_GATHER_KERNEL(T,OP) \
-  extern occa::kernel gatherKernel_##T##_##OP;
-
-#define DEFINE_SCATTER_KERNEL(T) \
-  extern occa::kernel scatterKernel_##T;
-
-#define DEFINE_KERNELS(T)                        \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHERSCATTER_KERNEL) \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHER_KERNEL)        \
-  DEFINE_SCATTER_KERNEL(T)
-
-OGS_FOR_EACH_TYPE(DEFINE_KERNELS)
-
-#undef DEFINE_GATHERSCATTER_KERNEL
-#undef DEFINE_GATHER_KERNEL
-#undef DEFINE_SCATTER_KERNEL
-#undef DEFINE_KERNELS
-
-void occaGatherScatterStart(occa::memory& o_v,
-                            const int Nentries, const int Nvectors, const dlong stride,
-                            const ogs_type type, const ogs_op op,
-                            const ogs_transpose trans, ogs_t &ogs);
-void occaGatherScatterFinish(occa::memory& o_v,
-                            const int Nentries, const int Nvectors, const dlong stride,
-                            const ogs_type type, const ogs_op op,
-                            const ogs_transpose trans, ogs_t &ogs);
-
-void occaGatherStart(occa::memory& o_gv, occa::memory& o_v,
-                     const int Nentries, const int Nvectors,
-                     const dlong gstride, const dlong stride,
-                     const ogs_type type, const ogs_op op,
-                     const ogs_transpose trans, ogs_t &ogs);
-void occaGatherFinish(occa::memory& o_gv, occa::memory& o_v,
-                     const int Nentries, const int Nvectors,
-                     const dlong gstride, const dlong stride,
-                     const ogs_type type, const ogs_op op,
-                     const ogs_transpose trans, ogs_t &ogs);
-
-void occaScatterStart(occa::memory& o_v, occa::memory& o_gv,
-                      const int Nentries, const int Nvectors,
-                      const dlong stride, const dlong gstride,
-                      const ogs_type type, const ogs_op op,
-                      const ogs_transpose trans, ogs_t &ogs);
-void occaScatterFinish(occa::memory& o_v, occa::memory& o_gv,
-                      const int Nentries, const int Nvectors,
-                      const dlong stride, const dlong gstride,
-                      const ogs_type type, const ogs_op op,
-                      const ogs_transpose trans, ogs_t &ogs);
-
-void hostGatherScatter(void* v, const int Nentries, const int Nvectors,
-                       const dlong stride, const ogs_type type,
-                       const ogs_op op, const ogs_transpose trans, ogs_t &ogs);
-
-void hostGather(void* gv, void* v, const int Nentries, const int Nvectors,
-                const dlong gstride, const dlong stride,
-                const ogs_type type, const ogs_op op,
-                const ogs_transpose trans, ogs_t &ogs);
-
-void hostScatter(void* v, void* gv, const int Nentries, const int Nvectors,
-                 const dlong stride, const dlong gstride,
-                 const ogs_type type, const ogs_op op,
-                 const ogs_transpose trans, ogs_t &ogs);
-
-void occaGatherScatterKernel(const ogsData_t &gather,
-                             const ogsData_t &scatter,
-                             const int Nentries,
-                             const int Nvectors,
-                             const dlong stride,
-                             const ogs_type type,
-                             const ogs_op op,
-                             occa::memory&  o_v);
-
-void occaGatherKernel(const ogsData_t &gather,
-                      const int Nentries,
-                      const int Nvectors,
-                      const dlong stride,
-                      const dlong gtride,
-                      const ogs_type type,
-                      const ogs_op op,
-                      occa::memory& o_v,
-                      occa::memory& o_gv);
-
-void occaScatterKernel(const ogsData_t &gather,
-                       const int Nentries,
-                       const int Nvectors,
-                       const dlong gtride,
-                       const dlong stride,
-                       const ogs_type type,
-                       const ogs_op op,
-                       occa::memory& o_gv,
-                       occa::memory& o_v);
-
-void hostGatherScatterKernel(const dlong N,
-                             const int Nentries,
-                             const int Nvectors,
-                             const dlong stride,
-                             dlong* gatherStarts,
-                             dlong* gatherIds,
-                             dlong* scatterStarts,
-                             dlong* scatterIds,
-                             const ogs_type type,
-                             const ogs_op op,
-                             void* v);
-
-void hostGatherKernel(const dlong N,
-                      const int Nentries,
-                      const int Nvectors,
-                      const dlong stride,
-                      const dlong gstride,
-                      const dlong *gatherStarts,
-                      const dlong *gatherIds,
-                      const ogs_type type,
-                      const ogs_op op,
-                      const void *v,
-                      void *gv);
-
-void hostScatterKernel(const dlong N,
-                       const int Nentries,
-                       const int Nvectors,
-                       const dlong gstride,
-                       const dlong stride,
-                       const dlong *scatterStarts,
-                       const dlong *scatterIds,
-                       const ogs_type type,
-                       const ogs_op op,
-                       const void *gv,
-                       void *v);
-
-void gsGatherScatter(void* v, const int Nentries, const int Nvectors,
-                     const dlong stride, const ogs_type type, const ogs_op op,
-                     const ogs_transpose trans, void * gsh);
-}
-
-#endif
diff --git a/include/ogs/ogsOperator.hpp b/include/ogs/ogsOperator.hpp
new file mode 100644
index 000000000..dffafc79c
--- /dev/null
+++ b/include/ogs/ogsOperator.hpp
@@ -0,0 +1,147 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef OGS_OPERATOR_HPP
+#define OGS_OPERATOR_HPP
+
+#include "ogs.hpp"
+
+namespace libp {
+
+namespace ogs {
+
+// The Z operator class is essentially a sparse CSR matrix,
+// with no vals stored. By construction, the sparse
+// matrix will have at most 1 non-zero per column.
+class ogsOperator_t {
+public:
+  platform_t platform;
+
+  dlong Ncols=0;  //number of columns
+  dlong NrowsN=0; //row count, N variant (NOTE(review): N/T presumably non-transposed/transposed, cf. the Transpose argument - confirm)
+  dlong NrowsT=0; //row count, T variant
+  dlong nnzN=0;   //non-zero count, N variant
+  dlong nnzT=0;   //non-zero count, T variant
+
+  memory<dlong> rowStartsN; //CSR row starts and column ids, one set per variant
+  memory<dlong> rowStartsT;
+  memory<dlong> colIdsN;
+  memory<dlong> colIdsT;
+
+  deviceMemory<dlong> o_rowStartsN; //o_ prefix: deviceMemory counterparts of the arrays above
+  deviceMemory<dlong> o_rowStartsT;
+  deviceMemory<dlong> o_colIdsN;
+  deviceMemory<dlong> o_colIdsT;
+
+  dlong NrowBlocksN=0; //NOTE(review): row-block partition, presumably built by setupRowBlocks() - confirm
+  dlong NrowBlocksT=0;
+  memory<dlong> blockRowStartsN;
+  memory<dlong> blockRowStartsT;
+  deviceMemory<dlong> o_blockRowStartsN;
+  deviceMemory<dlong> o_blockRowStartsT;
+
+  Kind kind; //no default initializer: left unset by both constructors below
+
+  ogsOperator_t()=default;
+  ogsOperator_t(platform_t& _platform)
+   : platform(_platform) {};
+
+  void Free();
+
+  void setupRowBlocks();
+
+  //Apply Z operator
+  template<template<typename> class U,
+           template<typename> class V,
+           typename T>
+  void Gather(U<T> gv, const V<T> v, //generic over the container templates U (output) and V (input)
+              const int k, const Op op, const Transpose trans);
+
+  template<typename T>
+  void Gather(deviceMemory<T> gv, const deviceMemory<T> v, //dedicated deviceMemory overload
+              const int k, const Op op, const Transpose trans);
+
+  //Apply Z^T transpose operator
+  template<template<typename> class U,
+           template<typename> class V,
+           typename T>
+  void Scatter(U<T> v, const V<T> gv, //generic over the container templates U (output) and V (input)
+               const int k, const Transpose trans);
+
+  template<typename T>
+  void Scatter(deviceMemory<T> v, const deviceMemory<T> gv, //dedicated deviceMemory overload
+               const int k, const Transpose trans);
+
+  //Apply Z^T*Z operator
+  template<template<typename> class U,
+           typename T>
+  void GatherScatter(U<T> v, const int k, //v is both input and output
+                     const Op op, const Transpose trans);
+
+  template<typename T>
+  void GatherScatter(deviceMemory<T> v, const int k, //dedicated deviceMemory overload
+                     const Op op, const Transpose trans);
+
+private:
+  template <template<typename> class U,
+            template<typename> class V,
+            template<typename> class Op, //note: shadows the Op type used by the public overloads; here the op is a template parameter
+            typename T>
+  void Gather(U<T> gv, const V<T> v,
+              const int K, const Transpose trans);
+  template <template<typename> class U,
+            template<typename> class Op, //note: shadows the Op type used by the public overloads; here the op is a template parameter
+            typename T>
+  void GatherScatter(U<T> v, const int K,
+                     const Transpose trans);
+
+  //NC: Hard code these for now. Should be sufficient for GPU devices, but needs attention for CPU
+  static constexpr int blockSize = 256;
+  static constexpr int gatherNodesPerBlock = 512; //should be a multiple of blockSize for good unrolling
+
+  //4 types - Float, Double, Int32, Int64
+  //4 ops - Add, Mul, Max, Min
+  static kernel_t gatherScatterKernel[4][4]; //NOTE(review): presumably indexed [type][op] per the comment above - confirm
+  static kernel_t gatherKernel[4][4];        //NOTE(review): presumably indexed [type][op] per the comment above - confirm
+  static kernel_t scatterKernel[4];          //one entry per type only (Scatter takes no Op argument)
+
+  friend void InitializeKernels(platform_t& platform, const Type type, const Op op); //NOTE(review): friend access, presumably to populate the static kernel tables - confirm
+};
+
+template <template<typename> class U, //declaration only; the definition is not in this header
+          template<typename> class V,
+          typename T>
+void extract(const dlong N, //NOTE(review): presumably the number of entries in ids - confirm
+             const int K,   //NOTE(review): presumably the number of values per entry - confirm
+             const memory<dlong> ids,
+             const U<T> q,  //const input container
+             V<T> gatherq); //NOTE(review): presumably receives the entries of q selected by ids - confirm against the definition
+
+} //namespace ogs
+
+} //namespace libp
+
+#endif
diff --git a/include/ogs/ogsUtils.hpp b/include/ogs/ogsUtils.hpp
new file mode 100644
index 000000000..1e11023ca
--- /dev/null
+++ b/include/ogs/ogsUtils.hpp
@@ -0,0 +1,87 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef OGS_UTILS_HPP
+#define OGS_UTILS_HPP
+
+#include "ogs.hpp"
+
+namespace libp {
+
+namespace ogs {
+
+struct parallelNode_t{ //plain aggregate describing one node; no members are default-initialized
+
+  dlong localId;    // local node id
+  hlong baseId;     // original global index
+
+  dlong newId;         // new global id
+  int sign; // NOTE(review): presumably the sign of the node's original id (cf. LocalSignedSetup) - confirm
+
+  int rank; //original rank
+  int destRank; //destination rank
+
+};
+
+template<typename T> //compile-time map from a C++ value type T to its ogs Type enumerator
+struct ogsType {
+  static constexpr Type get(); //declared only in the primary template; the specializations below provide the definitions
+};
+
+template<> struct ogsType<float> {
+  static constexpr Type get() { return Float; }
+};
+template<> struct ogsType<double> {
+  static constexpr Type get() { return Double; }
+};
+template<> struct ogsType<int> {
+  static constexpr Type get() { return Int32; }
+};
+template<> struct ogsType<long long int> {
+  static constexpr Type get() { return Int64; }
+};
+
+//permute an array A, where P maps an entry of A to the index it should move to
+// i.e. on return P(A[n]) == n for all n. P is expected to be a bijection onto [0,N); otherwise the inner loop may not terminate
+template<typename T, class Order>
+void permute(const dlong N, memory<T> A, Order P) { //NOTE(review): A is taken by value - relies on memory<T> copies sharing storage; confirm
+
+  for(dlong n=0;n<N;++n) {
+    //get what index A[n] should move to
+    dlong pn = P(A[n]);
+    while (pn!=n) { //follow the cycle through slot n until the entry that belongs at n arrives
+      //swap
+      std::swap(A[n], A[pn]); //A[pn] now holds its final entry; A[n] holds the entry displaced from pn
+      pn = P(A[n]);
+    }
+  }
+}
+
+} //namespace ogs
+
+} //namespace libp
+
+#endif
diff --git a/libs/mesh/meshPlotInterpTri3D.cpp b/include/operator.hpp
similarity index 73%
rename from libs/mesh/meshPlotInterpTri3D.cpp
rename to include/operator.hpp
index 4b5277a93..c1004b4b1 100644
--- a/libs/mesh/meshPlotInterpTri3D.cpp
+++ b/include/operator.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,19 +24,21 @@ SOFTWARE.
 
 */
 
-#include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+#ifndef OPERATOR_HPP
+#define OPERATOR_HPP
 
-//interpolate field to plotting nodes
-void meshTri3D::PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch){
+#include "core.hpp"
 
-  //interpolate
-  for(int n=0;n<plotNp;++n){
-    dfloat qn = 0;
+namespace libp {
 
-    for(int m=0;m<Np;++m){
-      qn += plotInterp[n*Np+m]*q[m];
-    }
-    Iq[n] = qn;
-  }
-}
+//basic operator: base class with a single overridable Operator method
+class operator_t {
+public:
+  virtual void Operator(deviceMemory<dfloat> &o_r, deviceMemory<dfloat> &o_Mr) { //default body ignores o_r and o_Mr
+    LIBP_FORCE_ABORT("Operator not implemented in this object"); //reached only when a derived class does not override Operator
+  };
+};
+
+} //namespace libp
+
+#endif
diff --git a/libs/mesh/meshPartitionStatistics.cpp b/include/parAdogs.hpp
similarity index 52%
rename from libs/mesh/meshPartitionStatistics.cpp
rename to include/parAdogs.hpp
index 8d236ef6b..b4868eda6 100644
--- a/libs/mesh/meshPartitionStatistics.cpp
+++ b/include/parAdogs.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,41 +24,39 @@ SOFTWARE.
 
 */
 
-#include "mesh.hpp"
-
-void mesh_t::PrintPartitionStatistics(){
-
-  /* now gather statistics on connectivity between processes */
-  int *comms = (int*) calloc(size, sizeof(int));
-  int Ncomms = 0;
-
-  /* count elements with neighbors on each other rank ranks */
-  for(dlong e=0;e<Nelements;++e){
-    for(int f=0;f<Nfaces;++f){
-      if(EToP[e*Nfaces+f]!=-1){
-        ++comms[EToP[e*Nfaces+f]];
-        ++Ncomms;
-      }
-    }
-  }
-
-  int Nmessages = 0;
-  for(int rr=0;rr<size;++rr)
-    if(comms[rr]>0)
-      ++Nmessages;
-
-  for(int rr=0;rr<size;++rr){
-    MPI_Barrier(comm);
-    if(rr==rank){
-      fflush(stdout);
-      printf("r: %02d [", rank);
-      for(int ss=0;ss<size;++ss){
-        printf(" %04d", comms[ss]);
-      }
-      printf("] (Nelements=" dlongFormat ", Nmessages=%d, Ncomms=%d)\n", Nelements,Nmessages, Ncomms);
-      fflush(stdout);
-    }
-  }
-
-  free(comms);
-}
+#ifndef PARADOGS_HPP
+#define PARADOGS_HPP 1
+
+#include "core.hpp"
+#include "settings.hpp"
+#include "platform.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+void AddSettings(settings_t& settings);    // declared here, defined elsewhere
+void ReportSettings(settings_t& settings); // declared here, defined elsewhere
+
+void MeshPartition(platform_t &platform,
+                   settings_t &settings,
+                   dlong &Nelements,                 // non-const reference: may be updated by the call
+                   const  int dim,
+                   const  int Nverts,
+                   const  int Nfaces,
+                   const  int NfaceVertices,
+                   const  memory<int>& faceVertices, // the only mesh array taken by const reference
+                   memory<hlong>& EToV,              // EToV..EZ are non-const references: may be updated by the call
+                   memory<hlong>& EToE,
+                   memory<int>& EToF,
+                   memory<dfloat>& EX,
+                   memory<dfloat>& EY,
+                   memory<dfloat>& EZ,
+                   comm_t comm);
+
+} //namespace paradogs
+
+} //namespace libp
+
+#endif
+
diff --git a/include/parAdogs/parAdogsGraph.hpp b/include/parAdogs/parAdogsGraph.hpp
new file mode 100644
index 000000000..45d73cd15
--- /dev/null
+++ b/include/parAdogs/parAdogsGraph.hpp
@@ -0,0 +1,159 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef PARADOGS_GRAPH_HPP
+#define PARADOGS_GRAPH_HPP 1
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsMatrix.hpp"
+#include "parAdogs/parAdogsMultigrid.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+class graph_t { // graph built from mesh connectivity, with partitioning entry points (InertialPartition, SpectralPartition)
+public:
+  /*Mesh data*/
+  static constexpr int MAX_NVERTS=8;     // capacity of the per-element vertex arrays in element_t
+  static constexpr int MAX_NFACES=6;     // capacity of the per-element neighbor arrays in element_t
+  static constexpr int MAX_NFACEVERTS=4; // together with MAX_NFACES, sizes faceVerts
+
+private:
+  platform_t platform;
+
+  comm_t gcomm;
+  comm_t comm;
+
+  int rank, size; // no default initializer, unlike the counters below
+  dlong Nverts=0, Nhalo=0;
+  hlong NVertsGlobal=0;
+  hlong VoffsetL=0, VoffsetU=0;
+
+  int grank, gsize; // no default initializer
+  hlong gNVertsGlobal=0;
+  hlong gVoffsetL=0, gVoffsetU=0;
+  
+
+  dlong Nelements=0;
+  int dim=0;
+  int Nfaces=0;
+  int NelementVerts=0;
+  int NfaceVerts=0;
+  struct element_t { // fixed-capacity per-element record (arrays sized by the MAX_* constants)
+    dfloat EX[MAX_NVERTS]; //x coordinates of verts
+    dfloat EY[MAX_NVERTS]; //y coordinates of verts
+    dfloat EZ[MAX_NVERTS]; //z coordinates of verts
+    hlong V[MAX_NVERTS];   //Global Vertex Ids of verts
+
+    hlong E[MAX_NFACES];   //Global element ids of neighbors
+    int F[MAX_NFACES];     //Face ids of neighbors
+  };
+  memory<element_t> elements;
+
+  int faceVerts[MAX_NFACES*MAX_NFACEVERTS]; // flat array with room for MAX_NFACEVERTS entries per face
+
+  /*Multilevel Laplacian (for spectral partitioning)*/
+  static constexpr int MAX_LEVELS=100;
+  int Nlevels=0;
+  mgLevel_t L[MAX_LEVELS]; // fixed-size array of levels; Nlevels counts the ones in use
+  coarseSolver_t coarseSolver;
+
+  memory<hlong> colIds;
+
+public:
+  /*Build a graph from mesh connectivity info*/
+  graph_t(platform_t &_platform,
+          const dlong _Nelements,
+          const int _dim,
+          const int _Nverts,
+          const int _Nfaces,
+          const int _NfaceVerts,
+          const memory<int>& faceVertices,
+          const memory<hlong>& EToV,
+          const memory<dfloat>& EX,
+          const memory<dfloat>& EY,
+          const memory<dfloat>& EZ,
+          comm_t _comm);
+
+  void InertialPartition();
+
+  void SpectralPartition();
+
+  void Connect();
+
+  void CuthillMckee();
+
+  void Report();
+
+  void ExtractMesh(dlong &Nelements_, // all parameters are non-const references (outputs of the call)
+                   memory<hlong>& EToV,
+                   memory<hlong>& EToE,
+                   memory<int>& EToF,
+                   memory<dfloat>& EX,
+                   memory<dfloat>& EY,
+                   memory<dfloat>& EZ);
+
+private:
+  void InertialBipartition(const dfloat targetFraction[2]);
+  void SpectralBipartition(const dfloat targetFraction[2]);
+
+
+  /*Divide graph into two pieces according to a bisection*/
+  void Split(const memory<int>& partition);
+
+  void CreateLaplacian();
+
+  /*Compute Fiedler vector of graph */
+  memory<dfloat>& FiedlerVector();
+
+  /*Improve a Fiedler vector*/
+  void Refine(const int level);
+
+  /* Solve A_{l}*x = b*/
+  int Solve(const int level,
+            const dfloat TOL,
+            memory<dfloat>& r,
+            memory<dfloat>& x,
+            memory<dfloat>& scratch);
+
+  /*Create multilevel hierarchy*/
+  void MultigridSetup();
+
+  void MultigridVcycle(const int l,
+                       memory<dfloat>& r,
+                       memory<dfloat>& x);
+
+  /*Clear multilevel hierarchy*/
+  void MultigridDestroy();
+};
+
+} //namespace paradogs
+
+} //namespace libp
+
+#endif
+
diff --git a/include/parAdogs/parAdogsMatrix.hpp b/include/parAdogs/parAdogsMatrix.hpp
new file mode 100644
index 000000000..cc9062f0f
--- /dev/null
+++ b/include/parAdogs/parAdogsMatrix.hpp
@@ -0,0 +1,128 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef PARADOGS_MATRIX_HPP
+#define PARADOGS_MATRIX_HPP 1
+
+#include "parAdogs.hpp"
+#include "ogs.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+struct nonZero_t { // one matrix entry as a (row, col, val) triple
+  hlong row;
+  hlong col;
+  dfloat val;
+};
+
+
+class parCSR { // distributed sparse matrix split into a local (diag) and a non-local (offd) part
+public:
+  platform_t platform;
+  comm_t comm;
+
+  dlong Nrows=0;
+  dlong Ncols=0;
+
+  //partition info
+  hlong rowOffsetL=0, rowOffsetU=0;
+  hlong colOffsetL=0, colOffsetU=0;
+
+  //local sparse matrix
+  struct CSR {
+    dlong nnz=0;
+    memory<dlong>  rowStarts;
+    memory<dlong>  cols;
+    memory<pfloat> vals; //stored as pfloat, while diagA/diagInv below are dfloat
+  };
+  CSR diag;
+
+  //non-local sparse matrix
+  struct MCSR {
+    dlong nnz=0;
+    dlong nzRows=0;
+
+    memory<dlong>  rowStarts;
+    memory<dlong>  mRowStarts;
+    memory<dlong>  rows;
+    memory<dlong>  cols;
+    memory<pfloat> vals; //stored as pfloat, like CSR::vals
+  };
+  MCSR offd;
+
+  memory<dfloat> diagA;
+  memory<dfloat> diagInv;
+
+  /*communication info*/
+  dlong NlocalCols = 0;
+  ogs::halo_t halo;
+  memory<hlong> colMap;
+
+  //rho ~= cond(invD * A)
+  dfloat rho=0.0;
+
+  parCSR()=default;
+  parCSR(dlong N, dlong M, platform_t& _platform, comm_t _comm): //sets sizes, platform and comm only
+    platform(_platform), comm(_comm), Nrows(N), Ncols(M) {}
+
+  //build a parCSR matrix from a distributed COO matrix
+  parCSR(dlong _Nrows, dlong _Ncols,
+         const dlong NNZ,
+         memory<nonZero_t>& entries,
+         const platform_t &_platform, //note: const here, non-const in the constructor above
+         comm_t comm);
+
+  void haloSetup(memory<hlong>& colIds);
+
+  // estimate rho(invD * A)
+  dfloat rhoDinvA(memory<dfloat>& null);
+
+  /*Aggregate via distance-2 PMIS*/
+  void Aggregate(dlong& cNverts,
+                 const dfloat theta,
+                 memory<hlong>& FineToCoarse);
+
+  void GalerkinProduct(const parCSR &A, const parCSR &P);
+
+  void SpMV(const dfloat alpha, memory<dfloat>& x, //two-vector overload: y is non-const
+            const dfloat beta, memory<dfloat>& y);
+  void SpMV(const dfloat alpha, memory<dfloat>& x, //three-vector overload: y is const, z is non-const
+            const dfloat beta, const memory<dfloat>& y, memory<dfloat>& z);
+
+  void SmoothChebyshev(memory<dfloat>& b, memory<dfloat>& x,
+                       const dfloat lambda0, const dfloat lambda1,
+                       const bool xIsZero, memory<dfloat>& scratch,
+                       const int ChebyshevIterations);
+};
+
+} //namespace paradogs
+
+} //namespace libp
+
+#endif
+
diff --git a/include/parAdogs/parAdogsMultigrid.hpp b/include/parAdogs/parAdogsMultigrid.hpp
new file mode 100644
index 000000000..086498f13
--- /dev/null
+++ b/include/parAdogs/parAdogsMultigrid.hpp
@@ -0,0 +1,121 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef PARADOGS_MULTIGRID_HPP
+#define PARADOGS_MULTIGRID_HPP 1
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsMatrix.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+class mgLevel_t { // one level of the multilevel hierarchy: operators A, P, R plus work vectors
+public:
+  dlong Nrows=0, Ncols=0;
+  hlong Nglobal=0;
+
+  parCSR A, P, R;
+
+  /*null vector*/
+  memory<dfloat> null;
+
+  /*Fiedler vector*/
+  memory<dfloat> Fiedler;
+
+  /*Vcycle storage*/
+  memory<dfloat> RHS;
+  memory<dfloat> X;
+  memory<dfloat> RES;
+  memory<dfloat> scratch;
+
+  dfloat lambda1=0.0, lambda0=0.0; //smoothing params; zero-initialized so a fresh level holds no indeterminate values
+
+  /*Create graph Laplacian*/
+  void CreateLaplacian(const dlong Nelements,
+                       const int Nfaces,
+                       const memory<hlong>& EToE,
+                       comm_t comm);
+
+  /*Construct a coarse level*/
+  void CoarsenLevel(mgLevel_t &Lf, const dfloat theta);
+
+  void SetupSmoother();
+
+  void AllocateScratch(const int l);
+
+  /*Compute Fiedler vector directly*/
+  void FiedlerVector();
+
+  /*Multigrid functions*/
+  void Smooth(memory<dfloat>& r, memory<dfloat>& x, const bool xIsZero);
+  void Residual(memory<dfloat>& r, memory<dfloat>& x, memory<dfloat>& res);
+  void Coarsen(memory<dfloat>& x, memory<dfloat>& xC);
+  void Prolongate(memory<dfloat>& xC, memory<dfloat>& x);
+};
+
+parCSR TentativeProlongator(const dlong Nf, //declaration only; returns a parCSR by value
+                            const dlong Nc,
+                            platform_t& platform,
+                            comm_t comm,
+                            memory<hlong>& FineToCoarse,
+                            memory<dfloat>& FineNull,
+                            memory<dfloat>& CoarseNull);
+
+parCSR SmoothProlongator(const parCSR& A, //declaration only; both inputs are const
+                         const parCSR& T);
+
+parCSR Transpose(const parCSR& A); //declaration only; returns a new parCSR, input is const
+
+parCSR SpMM(const parCSR& A, const parCSR& B); //declaration only; returns a new parCSR, inputs are const
+
+class coarseSolver_t { //coarse-level solver state with Setup and Solve entry points
+
+public:
+  comm_t comm;
+
+  int N=0;
+  int Nrows=0;
+  int Ncols=0;
+
+  int coarseTotal=0;
+  memory<int> coarseCounts;
+  memory<int> coarseOffsets;
+
+  memory<dfloat> invA; //NOTE(review): name suggests a stored inverse -- confirm in Setup
+  memory<dfloat> grhs;
+
+  void Setup(parCSR& A, memory<dfloat>& null);
+  void Solve(memory<dfloat>& r, memory<dfloat>& x);
+};
+
+} //namespace paradogs
+
+} //namespace libp
+
+#endif
+
diff --git a/include/mesh/meshDefines2D.h b/include/parAdogs/parAdogsPartition.hpp
similarity index 67%
rename from include/mesh/meshDefines2D.h
rename to include/parAdogs/parAdogsPartition.hpp
index 1d656753b..af102c878 100644
--- a/include/mesh/meshDefines2D.h
+++ b/include/parAdogs/parAdogsPartition.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,32 +24,22 @@ SOFTWARE.
 
 */
 
-#ifndef MESH_DEFINES2D_H
-#define MESH_DEFINES2D_H 1
-
-/* offsets for geometric factors */
-#define RXID 0
-#define RYID 1
-#define SXID 2
-#define SYID 3
-#define  JID 4
-#define JWID 5
-#define IJWID 6
-
-/* offsets for second order geometric factors */
-#define G00ID 0
-#define G01ID 1
-#define G11ID 2
-#define GWJID 3
-
-/* offsets for nx, ny, sJ, 1/J */
-#define NXID 0
-#define NYID 1
-#define SJID 2
-#define IJID 3
-#define IHID 4
-#define WSJID 5
-#define WIJID 6
+#ifndef PARADOGS_PARTITION_HPP
+#define PARADOGS_PARTITION_HPP 1
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+dfloat ParallelPivot(const dlong N, memory<dfloat>& F, //declaration only; NOTE(review): presumably a pivot value
+                     const hlong k, comm_t comm);      //related to index k of F across comm -- confirm in the definition
+
+} //namespace paradogs
+
+} //namespace libp
 
 #endif
 
diff --git a/include/parAlmond.hpp b/include/parAlmond.hpp
index 45ee3a1d7..7d9f2d5a0 100644
--- a/include/parAlmond.hpp
+++ b/include/parAlmond.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -35,22 +35,22 @@ SOFTWARE.
 #include "precon.hpp"
 #include "linearSolver.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-void AddSettings(settings_t& settings, const string prefix="");
+void AddSettings(settings_t& settings, const std::string prefix="");
 void ReportSettings(settings_t& settings);
 
-extern MPI_Datatype MPI_NONZERO_T;
-
 //distributed matrix class passed to AMG setup
 class parCOO {
 public:
-  platform_t &platform;
-  MPI_Comm comm;
+  platform_t platform;
+  comm_t comm;
 
   dlong nnz=0;
-  hlong *globalRowStarts=nullptr;
-  hlong *globalColStarts=nullptr;
+  memory<hlong> globalRowStarts;
+  memory<hlong> globalColStarts;
 
   //non-zero matrix entries
   struct nonZero_t {
@@ -58,70 +58,183 @@ class parCOO {
     hlong col;
     dfloat val;
   };
-  nonZero_t *entries=nullptr;
+  memory<nonZero_t> entries;
 
-  parCOO(platform_t &_platform, MPI_Comm _comm):
+  parCOO() = default;
+  parCOO(platform_t &_platform, comm_t _comm):
     platform(_platform), comm(_comm) {};
-
-  ~parCOO() {
-    if(entries) free(entries);
-    if(globalRowStarts) free(globalRowStarts);
-    if(globalColStarts) free(globalColStarts);
-  }
 };
 
 //abstract multigrid level
 // Class is derived from solver, and must have Operator defined
-class multigridLevel: public solver_t  {
+class multigridLevel: public operator_t  {
 public:
+  platform_t platform;
+  settings_t settings;
+  comm_t comm;
+
   dlong Nrows=0, Ncols=0;
 
-  occa::memory o_scratch;
+  deviceMemory<dfloat> o_scratch;
 
+  multigridLevel() = default;
   multigridLevel(dlong N, dlong M, platform_t& _platform,
-                 settings_t& _settings):
-    solver_t(_platform, _settings), Nrows(N), Ncols(M) {}
-  virtual ~multigridLevel() {};
-
-  virtual void smooth(occa::memory& o_rhs, occa::memory& o_x, bool x_is_zero)=0;
-  virtual void residual(occa::memory& o_rhs, occa::memory& o_x, occa::memory& o_res)=0;
-  virtual void coarsen(occa::memory& o_x, occa::memory& o_Cx)=0;
-  virtual void prolongate(occa::memory& o_x, occa::memory& o_Px)=0;
+                 settings_t& _settings, comm_t _comm):
+    platform(_platform), settings(_settings),
+    comm(_comm), Nrows(N), Ncols(M) {}
+
+  virtual void smooth(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_x, bool x_is_zero)=0;
+  virtual void residual(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_res)=0;
+  virtual void coarsen(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_Cx)=0;
+  virtual void prolongate(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_Px)=0;
   virtual void Report()=0;
 };
 
-//forward declaration
+typedef enum {VCYCLE=0,KCYCLE=1,EXACT=3} CycleType; //note: EXACT is 3; the value 2 is not used
+typedef enum {SMOOTHED=0,UNSMOOTHED=1} AggType;
+typedef enum {PCG=0,GMRES=1} KrylovType;
+typedef enum {DAMPED_JACOBI=0,CHEBYSHEV=1} SmoothType;
+typedef enum {RUGESTUBEN=0,SYMMETRIC=1} StrengthType;
+typedef enum {COARSEEXACT=0,COARSEOAS=1} CoarseType;
+
+class coarseSolver_t;
+
 //multigrid preconditioner
-class multigrid_t;
+class multigrid_t: public operator_t {
+public:
+  platform_t platform;
+  settings_t settings;
+  comm_t comm;
+
+  bool exact=false;
+  linearSolver_t linearSolver;
+
+  CycleType ctype;
+  AggType aggtype;
+  StrengthType strtype;
+  CoarseType coarsetype;
+
+  int numLevels=0;
+  int baseLevel=0;
+  static constexpr int PARALMOND_MAX_LEVELS=100;
+  std::shared_ptr<multigridLevel> levels[PARALMOND_MAX_LEVELS];
+
+  deviceMemory<dfloat> o_rhs[PARALMOND_MAX_LEVELS];
+  deviceMemory<dfloat> o_x[PARALMOND_MAX_LEVELS];
+
+  std::shared_ptr<coarseSolver_t> coarseSolver;
+
+  //scratch space for smoothing and temporary residual vector
+  size_t NscratchSpace=0;
+  deviceMemory<dfloat> o_scratch;
+
+  KrylovType ktype;
+
+  deviceMemory<dfloat> o_ck[PARALMOND_MAX_LEVELS];
+  deviceMemory<dfloat> o_vk[PARALMOND_MAX_LEVELS];
+  deviceMemory<dfloat> o_wk[PARALMOND_MAX_LEVELS];
+
+  //scratch space
+  size_t NreductionScratch=0;
+  pinnedMemory<dfloat> reductionScratch;
+  deviceMemory<dfloat> o_reductionScratch;
 
-class parAlmond_t: public precon_t {
+  multigrid_t() = default;
+  multigrid_t(platform_t& _platform, settings_t& _settings,
+              comm_t _comm);
+
+  template<class Level, class... Args>
+  Level& AddLevel(Args&& ... args) { //constructs a Level in slot numLevels and returns a reference to it
+    levels[numLevels++] = std::make_shared<Level>(std::forward<Args>(args)...); //perfect-forward so rvalue arguments are moved, not copied
+    AllocateLevelWorkSpace(numLevels-1); //NOTE(review): numLevels is not checked against PARALMOND_MAX_LEVELS
+    return dynamic_cast<Level&>(*levels[numLevels-1]);
+  }
+  template<class Level>
+  Level& GetLevel(const int l) {
+    return dynamic_cast<Level&>(*levels[l]);
+  }
+
+  void AllocateLevelWorkSpace(const int k);
+
+  void Operator(deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_X);
+
+  void vcycle(const int k, deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_X);
+  void kcycle(const int k, deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_X);
+
+private:
+  void kcycleOp1(multigridLevel& level,
+                 deviceMemory<dfloat>& o_X,  deviceMemory<dfloat>& o_RHS,
+                 deviceMemory<dfloat>& o_CK, deviceMemory<dfloat>& o_VK,
+                 dfloat& alpha1, dfloat& rho1,
+                 dfloat& norm_rhs, dfloat& norm_rhstilde);
+
+  void kcycleOp2(multigridLevel& level,
+                deviceMemory<dfloat>& o_X,  deviceMemory<dfloat>& o_RHS,
+                deviceMemory<dfloat>& o_CK, deviceMemory<dfloat>& o_VK, deviceMemory<dfloat>& o_WK,
+                const dfloat alpha1, const dfloat rho1);
+
+  void kcycleCombinedOp1(multigridLevel& level,
+                        deviceMemory<dfloat>& o_a,
+                        deviceMemory<dfloat>& o_b,
+                        deviceMemory<dfloat>& o_c,
+                        dfloat& aDotb,
+                        dfloat& aDotc,
+                        dfloat& bDotb);
+  void kcycleCombinedOp2(multigridLevel& level,
+                        deviceMemory<dfloat>& o_a,
+                        deviceMemory<dfloat>& o_b,
+                        deviceMemory<dfloat>& o_c,
+                        deviceMemory<dfloat>& o_d,
+                        dfloat& aDotb,
+                        dfloat& aDotc,
+                        dfloat& aDotd);
+  dfloat vectorAddInnerProd(multigridLevel& level,
+                          const dfloat alpha, deviceMemory<dfloat>& o_x,
+                          const dfloat beta,  deviceMemory<dfloat>& o_y);
+};
+
+class parAlmond_t: public operator_t {
 public:
-  parAlmond_t(platform_t& _platform, settings_t& settings_, MPI_Comm comm);
-  ~parAlmond_t();
+  parAlmond_t() = default;
+  parAlmond_t(platform_t& _platform, settings_t& _settings, comm_t _comm) {
+    Setup(_platform, _settings, _comm);
+  }
+
+  void Setup(platform_t& _platform, settings_t& _settings, comm_t _comm);
 
-  //Add level to multigrid heirarchy
-  void AddLevel(multigridLevel* level);
+  template<class Level, class... Args>
+  Level& AddLevel(Args&& ... args) { //delegates to multigrid_t::AddLevel; dereferences multigrid, which defaults to nullptr
+    return multigrid->AddLevel<Level, Args...>(std::forward<Args>(args)...); //Args is fixed explicitly, so an rvalue must be forwarded: a plain lvalue cannot bind to T&&
+  }
+  template<class Level>
+  Level& GetLevel(const int l) {
+    return multigrid->GetLevel<Level>(l);
+  }
+
+  int NumLevels();
 
   // Setup AMG
   //-- Local A matrix data must be globally indexed & row sorted
   void AMGSetup(parCOO& A,
                bool nullSpace,
-               dfloat *nullVector,
+               memory<dfloat> nullVector,
                dfloat nullSpacePenalty);
 
-  void Operator(occa::memory& o_rhs, occa::memory& o_x);
+  void Operator(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_x);
 
   void Report();
 
   dlong getNumCols(int k);
   dlong getNumRows(int k);
 private:
-  platform_t& platform;
-  settings_t& settings;
+  platform_t platform;
+  settings_t settings;
 
-  multigrid_t *multigrid=nullptr;
+  std::shared_ptr<multigrid_t> multigrid=nullptr;
 };
 
 } //namespace parAlmond
 
+} //namespace libp
+
 #endif
diff --git a/include/parAlmond/parAlmondAMGLevel.hpp b/include/parAlmond/parAlmondAMGLevel.hpp
index 4f1f556a5..9cb5a5113 100644
--- a/include/parAlmond/parAlmondAMGLevel.hpp
+++ b/include/parAlmond/parAlmondAMGLevel.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -30,30 +30,31 @@ SOFTWARE.
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondparCSR.hpp"
 
+namespace libp {
 
 namespace parAlmond {
 
 class amgLevel: public multigridLevel {
 
 public:
-  parCSR *A=nullptr, *P=nullptr, *R=nullptr;
+  parCSR A, P, R;
 
   SmoothType stype;
   dfloat lambda, lambda1, lambda0; //smoothing params
 
   int ChebyshevIterations=2;
 
-  amgLevel(parCSR *AA, settings_t& _settings);
-  ~amgLevel();
+  amgLevel() = default;
+  amgLevel(parCSR& AA, settings_t& _settings);
 
-  void Operator(occa::memory& o_x, occa::memory& o_Ax);
-  void residual(occa::memory& o_rhs, occa::memory& o_x, occa::memory& o_res);
-  void coarsen(occa::memory& o_x, occa::memory& o_Cx);
-  void prolongate(occa::memory& o_x, occa::memory& o_Px);
+  void Operator(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_Ax);
+  void residual(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_res);
+  void coarsen(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_Cx);
+  void prolongate(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_Px);
 
-  void smooth(occa::memory& o_rhs, occa::memory& o_x, bool x_is_zero);
-  void smoothDampedJacobi(occa::memory& o_r, occa::memory& o_x, bool x_is_zero);
-  void smoothChebyshev(occa::memory& o_r, occa::memory& o_x, bool x_is_zero);
+  void smooth(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_x, bool x_is_zero);
+  void smoothDampedJacobi(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_x, bool x_is_zero);
+  void smoothChebyshev(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_x, bool x_is_zero);
 
   void Report();
 
@@ -62,6 +63,8 @@ class amgLevel: public multigridLevel {
   void syncToDevice();
 };
 
-}
+} //namespace parAlmond
+
+} //namespace libp
 
 #endif
diff --git a/include/parAlmond/parAlmondAMGSetup.hpp b/include/parAlmond/parAlmondAMGSetup.hpp
index 58c80c203..442cc2214 100644
--- a/include/parAlmond/parAlmondAMGSetup.hpp
+++ b/include/parAlmond/parAlmondAMGSetup.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,52 +28,50 @@ SOFTWARE.
 #define PARALMOND_AMGSETUP_HPP
 
 #include "parAlmond.hpp"
-#include "parAlmond/parAlmondMultigrid.hpp"
 #include "parAlmond/parAlmondAMGLevel.hpp"
 
+namespace libp {
 
 namespace parAlmond {
 
 class strongGraph_t {
 public:
-  platform_t& platform;
-  MPI_Comm comm;
+  platform_t platform;
+  comm_t comm;
   dlong Nrows=0;
   dlong Ncols=0;
   dlong nnz=0;
 
-  dlong  *rowStarts=nullptr;
-  dlong  *cols=nullptr;
+  memory<dlong> rowStarts;
+  memory<dlong> cols;
 
-  strongGraph_t(dlong N, dlong M, platform_t& _platform, MPI_Comm _comm):
+  strongGraph_t(dlong N, dlong M, platform_t& _platform, comm_t _comm):
     platform(_platform), comm(_comm), Nrows(N), Ncols(M) {}
-  ~strongGraph_t() {
-    if (rowStarts) free(rowStarts);
-    if (cols) free(cols);
-  }
 };
 
-amgLevel *coarsenAmgLevel(amgLevel *level, dfloat *null,
-                          StrengthType strtype, dfloat theta,
-                          AggType aggtype);
+amgLevel coarsenAmgLevel(amgLevel& level, memory<dfloat>& null,
+                         StrengthType strtype, dfloat theta,
+                         AggType aggtype);
 
-strongGraph_t* strongGraph(parCSR *A, StrengthType type, dfloat theta);
+strongGraph_t strongGraph(parCSR& A, StrengthType type, dfloat theta);
 
-void formAggregates(parCSR *A, strongGraph_t *C,
-                     hlong* FineToCoarse,
-                     hlong* globalAggStarts);
+void formAggregates(parCSR& A, strongGraph_t& C,
+                    memory<hlong> FineToCoarse,
+                    memory<hlong> globalAggStarts);
 
-parCSR *tentativeProlongator(parCSR *A, hlong *FineToCoarse,
-                            hlong *globalAggStarts, dfloat *null);
+parCSR tentativeProlongator(parCSR& A, memory<hlong> FineToCoarse,
+                            memory<hlong> globalAggStarts, memory<dfloat> null);
 
-parCSR *smoothProlongator(parCSR *A, parCSR *T);
+parCSR smoothProlongator(parCSR& A, parCSR& T);
 
-parCSR *transpose(parCSR *A);
+parCSR transpose(parCSR& A);
 
-parCSR *SpMM(parCSR *A, parCSR *B);
+parCSR SpMM(parCSR& A, parCSR& B);
 
-parCSR *galerkinProd(parCSR *A, parCSR *P);
+parCSR galerkinProd(parCSR& A, parCSR& P);
 
-}
+} //namespace parAlmond
+
+} //namespace libp
 
 #endif
diff --git a/include/parAlmond/parAlmondCoarseSolver.hpp b/include/parAlmond/parAlmondCoarseSolver.hpp
index 8ff955633..3cd08c791 100644
--- a/include/parAlmond/parAlmondCoarseSolver.hpp
+++ b/include/parAlmond/parAlmondCoarseSolver.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -34,100 +34,105 @@ SOFTWARE.
 #include "parAlmond/parAlmondDefines.hpp"
 #include "parAlmond/parAlmondparCSR.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-class coarseSolver_t: public solver_t {
+class coarseSolver_t: public operator_t { // abstract coarse-solver interface; NOTE(review): declares no virtual destructor — confirm operator_t provides one
 
 public:
+  platform_t platform; // stored by value, copied from the constructor argument
+  settings_t settings; // stored by value, copied from the constructor argument
+  comm_t comm; // communicator passed to the constructor
+
   int Nrows;
   int Ncols;
 
-  MPI_Comm comm;
   int rank, size;
 
   coarseSolver_t(platform_t& _platform, settings_t& _settings,
-                 MPI_Comm _comm):
-    solver_t(_platform, _settings), comm(_comm) {}
-  virtual ~coarseSolver_t() {}
+                 comm_t _comm):
+    platform(_platform), settings(_settings),
+    comm(_comm) {} // NOTE(review): Nrows, Ncols, rank and size are not initialized here — presumably set by setup(); confirm
 
   virtual int getTargetSize()=0;
 
-  virtual void setup(parCSR *A, bool nullSpace,
-                     dfloat *nullVector, dfloat nullSpacePenalty)=0;
+  virtual void setup(parCSR& A, bool nullSpace,
+                     memory<dfloat> nullVector, dfloat nullSpacePenalty)=0;
 
   virtual void syncToDevice()=0;
 
   virtual void Report(int lev)=0;
 
-  virtual void solve(occa::memory& o_rhs, occa::memory& o_x)=0;
+  virtual void solve(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_x)=0; // both arguments are typed device buffers
 };
 
 class exactSolver_t: public coarseSolver_t {
 
 public:
-  parCSR *A=nullptr;
+  parCSR A;
 
   int coarseTotal;
   int coarseOffset;
-  int *coarseOffsets=nullptr;
-  int *coarseCounts=nullptr;
-  int *sendOffsets=nullptr;
-  int *sendCounts=nullptr;
+  memory<int> coarseOffsets;
+  memory<int> coarseCounts;
+  memory<int> sendOffsets;
+  memory<int> sendCounts;
 
   int N;
   int offdTotal=0;
 
-  dfloat *diagInvAT=nullptr, *offdInvAT=nullptr;
-  occa::memory o_diagInvAT, o_offdInvAT;
+  memory<dfloat> diagInvAT, offdInvAT;
+  deviceMemory<dfloat> o_diagInvAT, o_offdInvAT;
 
-  dfloat *diagRhs=nullptr, *offdRhs=nullptr;
-  occa::memory o_offdRhs;
+  memory<dfloat> diagRhs, offdRhs;
+  deviceMemory<dfloat> o_offdRhs;
 
   exactSolver_t(platform_t& _platform, settings_t& _settings,
-                MPI_Comm _comm):
+                comm_t _comm):
     coarseSolver_t(_platform, _settings, _comm) {}
-  ~exactSolver_t();
 
   int getTargetSize();
 
-  void setup(parCSR *A, bool nullSpace,
-             dfloat *nullVector, dfloat nullSpacePenalty);
+  void setup(parCSR& A, bool nullSpace,
+             memory<dfloat> nullVector, dfloat nullSpacePenalty);
 
   void syncToDevice();
 
   void Report(int lev);
 
-  void solve(occa::memory& o_rhs, occa::memory& o_x);
+  void solve(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_x);
 };
 
 class oasSolver_t: public coarseSolver_t {
 
 public:
-  parCSR* A;
+  parCSR A;
 
   int N;
   int diagTotal=0, offdTotal=0;
 
-  dfloat *diagInvAT=nullptr, *offdInvAT=nullptr;
-  occa::memory o_diagInvAT, o_offdInvAT;
+  memory<dfloat> diagInvAT, offdInvAT;
+  deviceMemory<dfloat> o_diagInvAT, o_offdInvAT;
 
   oasSolver_t(platform_t& _platform, settings_t& _settings,
-              MPI_Comm _comm):
+              comm_t _comm):
     coarseSolver_t(_platform, _settings, _comm) {}
-  ~oasSolver_t();
 
   int getTargetSize();
 
-  void setup(parCSR *A, bool nullSpace,
-             dfloat *nullVector, dfloat nullSpacePenalty);
+  void setup(parCSR& A, bool nullSpace,
+             memory<dfloat> nullVector, dfloat nullSpacePenalty);
 
   void syncToDevice();
 
   void Report(int lev);
 
-  void solve(occa::memory& o_rhs, occa::memory& o_x);
+  void solve(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_x);
 };
 
-}
+} //namespace parAlmond
+
+} //namespace libp
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/parAlmond/parAlmondDefines.hpp b/include/parAlmond/parAlmondDefines.hpp
index d60afa9a6..5aaa72d2a 100644
--- a/include/parAlmond/parAlmondDefines.hpp
+++ b/include/parAlmond/parAlmondDefines.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,8 +27,16 @@ SOFTWARE.
 #ifndef PARALMOND_DEFINES_HPP
 #define PARALMOND_DEFINES_HPP
 
-#define PARALMOND_NBLOCKS 128
-#define NUMKCYCLES 3
-#define KCYCLETOL 0.2
+namespace libp {
 
-#endif
\ No newline at end of file
+namespace parAlmond {
+
+constexpr int PARALMOND_NBLOCKS=128;
+constexpr int NUMKCYCLES=3;
+constexpr dfloat KCYCLETOL=0.2;
+
+} //namespace parAlmond
+
+} //namespace libp
+
+#endif
diff --git a/include/parAlmond/parAlmondKernels.hpp b/include/parAlmond/parAlmondKernels.hpp
index f6d388389..80efb2280 100644
--- a/include/parAlmond/parAlmondKernels.hpp
+++ b/include/parAlmond/parAlmondKernels.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,38 +27,41 @@ SOFTWARE.
 #ifndef PARALMOND_KERNELS_HPP
 #define PARALMOND_KERNELS_HPP
 
+namespace libp {
+
 namespace parAlmond {
 
   void buildParAlmondKernels(platform_t& platform);
 
   void freeParAlmondKernels();
 
-  extern int Nrefs;
-
-  extern const int blockSize;
-  extern const int NonzerosPerBlock;
+  //NC: Hard code these for now. Should be sufficient for GPU devices, but needs attention for CPU
+  constexpr int blockSize = 256;
+  constexpr int NonzerosPerBlock = 2048; //should be a multiple of blockSize for good unrolling
 
-  extern occa::kernel SpMVcsrKernel1;
-  extern occa::kernel SpMVcsrKernel2;
-  extern occa::kernel SpMVmcsrKernel;
+  extern kernel_t SpMVcsrKernel1;
+  extern kernel_t SpMVcsrKernel2;
+  extern kernel_t SpMVmcsrKernel;
 
-  extern occa::kernel SmoothJacobiCSRKernel;
-  extern occa::kernel SmoothJacobiMCSRKernel;
+  extern kernel_t SmoothJacobiCSRKernel;
+  extern kernel_t SmoothJacobiMCSRKernel;
 
-  extern occa::kernel SmoothChebyshevStartKernel;
-  extern occa::kernel SmoothChebyshevCSRKernel;
-  extern occa::kernel SmoothChebyshevMCSRKernel;
-  extern occa::kernel SmoothChebyshevUpdateKernel;
+  extern kernel_t SmoothChebyshevStartKernel;
+  extern kernel_t SmoothChebyshevCSRKernel;
+  extern kernel_t SmoothChebyshevMCSRKernel;
+  extern kernel_t SmoothChebyshevUpdateKernel;
 
-  extern occa::kernel vectorAddInnerProdKernel;
-  extern occa::kernel vectorAddWeightedInnerProdKernel;
-  extern occa::kernel kcycleCombinedOp1Kernel;
-  extern occa::kernel kcycleCombinedOp2Kernel;
-  extern occa::kernel kcycleWeightedCombinedOp1Kernel;
-  extern occa::kernel kcycleWeightedCombinedOp2Kernel;
+  extern kernel_t vectorAddInnerProdKernel;
+  extern kernel_t vectorAddWeightedInnerProdKernel;
+  extern kernel_t kcycleCombinedOp1Kernel;
+  extern kernel_t kcycleCombinedOp2Kernel;
+  extern kernel_t kcycleWeightedCombinedOp1Kernel;
+  extern kernel_t kcycleWeightedCombinedOp2Kernel;
 
-  extern occa::kernel dGEMVKernel;
+  extern kernel_t dGEMVKernel;
 
 } //namespace parAlmond
 
-#endif
\ No newline at end of file
+} // namespace libp
+
+#endif
diff --git a/include/parAlmond/parAlmondMultigrid.hpp b/include/parAlmond/parAlmondMultigrid.hpp
deleted file mode 100644
index 01aabfbe2..000000000
--- a/include/parAlmond/parAlmondMultigrid.hpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef PARALMOND_MULTIGRID_HPP
-#define PARALMOND_MULTIGRID_HPP
-
-#include "settings.hpp"
-#include "platform.hpp"
-#include "solver.hpp"
-#include "precon.hpp"
-#include "parAlmond.hpp"
-#include "parAlmond/parAlmondDefines.hpp"
-#include "parAlmond/parAlmondCoarseSolver.hpp"
-
-namespace parAlmond {
-
-#define PARALMOND_MAX_LEVELS 100
-
-typedef enum {VCYCLE=0,KCYCLE=1,EXACT=3} CycleType;
-typedef enum {SMOOTHED=0,UNSMOOTHED=1} AggType;
-typedef enum {PCG=0,GMRES=1} KrylovType;
-typedef enum {DAMPED_JACOBI=0,CHEBYSHEV=1} SmoothType;
-typedef enum {RUGESTUBEN=0,SYMMETRIC=1} StrengthType;
-typedef enum {COARSEEXACT=0,COARSEOAS=1} CoarseType;
-
-//multigrid preconditioner
-class multigrid_t: public precon_t {
-public:
-  platform_t& platform;
-  settings_t& settings;
-  MPI_Comm comm;
-
-  bool exact;
-  linearSolver_t *linearSolver=nullptr;
-
-  CycleType ctype;
-  AggType aggtype;
-  StrengthType strtype;
-  CoarseType coarsetype;
-
-  int numLevels=0;
-  int baseLevel=0;
-  multigridLevel *levels[PARALMOND_MAX_LEVELS];
-
-  occa::memory o_rhs[PARALMOND_MAX_LEVELS];
-  occa::memory o_x[PARALMOND_MAX_LEVELS];
-
-  coarseSolver_t *coarseSolver;
-
-  //scratch space for smoothing and temporary residual vector
-  size_t scratchSpaceBytes=0;
-  occa::memory o_scratch;
-
-  KrylovType ktype;
-
-  occa::memory o_ck[PARALMOND_MAX_LEVELS];
-  occa::memory o_vk[PARALMOND_MAX_LEVELS];
-  occa::memory o_wk[PARALMOND_MAX_LEVELS];
-
-  //scratch space
-  size_t reductionScratchBytes=0;
-  void *reductionScratch=nullptr;
-  occa::memory h_reductionScratch;
-  occa::memory o_reductionScratch;
-
-  multigrid_t(platform_t& _platform, settings_t& _settings, MPI_Comm _comm);
-  ~multigrid_t();
-
-  void AddLevel(multigridLevel* level);
-
-  void Operator(occa::memory& o_RHS, occa::memory& o_X);
-
-  void vcycle(const int k, occa::memory& o_RHS, occa::memory& o_X);
-  void kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X);
-
-private:
-  void kcycleOp1(multigridLevel* level,
-                 occa::memory& o_X,  occa::memory& o_RHS,
-                 occa::memory& o_CK, occa::memory& o_VK,
-                 dfloat *alpha1, dfloat *rho1,
-                 dfloat *norm_rhs, dfloat *norm_rhstilde);
-
-  void kcycleOp2(multigridLevel* level,
-                occa::memory& o_X,  occa::memory& o_RHS,
-                occa::memory& o_CK, occa::memory& o_VK, occa::memory& o_WK,
-                const dfloat alpha1, const dfloat rho1);
-
-  void kcycleCombinedOp1(multigridLevel* level, dfloat *aDotbc, occa::memory& o_a,
-                       occa::memory& o_b, occa::memory& o_c);
-  void kcycleCombinedOp2(multigridLevel* level, dfloat *aDotbcd,
-                        occa::memory& o_a, occa::memory& o_b,
-                        occa::memory& o_c, occa::memory& o_d);
-  dfloat vectorAddInnerProd(multigridLevel* level,
-                          const dfloat alpha, occa::memory& o_x,
-                          const dfloat beta,  occa::memory& o_y);
-};
-
-}
-
-#endif
\ No newline at end of file
diff --git a/include/parAlmond/parAlmondparCSR.hpp b/include/parAlmond/parAlmondparCSR.hpp
index 48a89c3f0..893a1ee07 100644
--- a/include/parAlmond/parAlmondparCSR.hpp
+++ b/include/parAlmond/parAlmondparCSR.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,30 +27,32 @@ SOFTWARE.
 #ifndef PARALMOND_PARCSR_HPP
 #define PARALMOND_PARCSR_HPP
 
+namespace libp {
+
 namespace parAlmond {
 
 class parCSR {
 public:
-  platform_t& platform;
-  MPI_Comm comm;
+  platform_t platform;
+  comm_t comm;
 
-  dlong Nrows;
-  dlong Ncols;
+  dlong Nrows=0;
+  dlong Ncols=0;
 
   //local sparse matrix
   struct CSR {
     dlong nnz=0;
     dlong NrowBlocks=0;
 
-    dlong  *blockRowStarts=nullptr;
-    dlong  *rowStarts=nullptr;
-    dlong  *cols=nullptr;
-    pfloat *vals=nullptr;
+    memory<dlong>  blockRowStarts;
+    memory<dlong>  rowStarts;
+    memory<dlong>  cols;
+    memory<pfloat> vals;
 
-    occa::memory o_blockRowStarts;
-    occa::memory o_rowStarts;
-    occa::memory o_cols;
-    occa::memory o_vals;
+    deviceMemory<dlong>  o_blockRowStarts;
+    deviceMemory<dlong>  o_rowStarts;
+    deviceMemory<dlong>  o_cols;
+    deviceMemory<pfloat> o_vals;
   };
   CSR diag;
 
@@ -60,47 +62,46 @@ class parCSR {
     dlong nzRows=0;
     dlong NrowBlocks=0;
 
-    dlong  *blockRowStarts=nullptr;
-    dlong  *rowStarts=nullptr;
-    dlong  *mRowStarts=nullptr; //compressed version of rowStarts
-    dlong  *rows=nullptr;
-    dlong  *cols=nullptr;
-    pfloat *vals=nullptr;
-
-    occa::memory o_blockRowStarts;
-    occa::memory o_mRowStarts;
-    occa::memory o_rows;
-    occa::memory o_cols;
-    occa::memory o_vals;
+    memory<dlong>  blockRowStarts;
+    memory<dlong>  rowStarts;
+    memory<dlong>  mRowStarts; //compressed version of rowStarts
+    memory<dlong>  rows;
+    memory<dlong>  cols;
+    memory<pfloat> vals;
+
+    deviceMemory<dlong>  o_blockRowStarts;
+    deviceMemory<dlong>  o_mRowStarts;
+    deviceMemory<dlong>  o_rows;
+    deviceMemory<dlong>  o_cols;
+    deviceMemory<pfloat> o_vals;
   };
   MCSR offd;
 
-  dfloat *diagA=nullptr;
-  dfloat *diagInv=nullptr;
+  memory<dfloat> diagA;
+  memory<dfloat> diagInv;
 
-  occa::memory o_diagA;
-  occa::memory o_diagInv;
+  deviceMemory<dfloat> o_diagA;
+  deviceMemory<dfloat> o_diagInv;
 
   //partition info
-  hlong *globalRowStarts=nullptr;
-  hlong *globalColStarts=nullptr;
-  hlong *colMap=nullptr;
+  memory<hlong> globalRowStarts;
+  memory<hlong> globalColStarts;
+  memory<hlong> colMap;
 
-  halo_t *halo = nullptr;
+  ogs::halo_t halo;
   dlong NlocalCols = 0;
 
   //rho ~= cond(invD * A)
   dfloat rho=0.0;
 
-  parCSR(dlong N, dlong M, platform_t& _platform, MPI_Comm _comm):
+  parCSR() = default;
+  parCSR(dlong N, dlong M, platform_t& _platform, comm_t _comm):
     platform(_platform), comm(_comm), Nrows(N), Ncols(M) {}
 
   //build a parCSR matrix from a distributed COO matrix
   parCSR(parCOO& A);
 
-  ~parCSR();
-
-  void haloSetup(hlong *colIds);
+  void haloSetup(memory<hlong> colIds);
 
   void diagSetup();
 
@@ -108,28 +109,28 @@ class parCSR {
 
   void syncToDevice();
 
-  void SpMV(const dfloat alpha, dfloat *x,
-            const dfloat beta, dfloat *y);
-  void SpMV(const dfloat alpha, dfloat *x,
-            const dfloat beta, const dfloat *y, dfloat *z);
+  void SpMV(const dfloat alpha, memory<dfloat>& x,
+            const dfloat beta, memory<dfloat>& y);
+  void SpMV(const dfloat alpha, memory<dfloat>& x,
+            const dfloat beta, const memory<dfloat>& y, memory<dfloat>& z);
 
-  void SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta,
-            occa::memory& o_y);
-  void SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta,
-            occa::memory& o_y, occa::memory& o_z);
+  void SpMV(const dfloat alpha, deviceMemory<dfloat>& o_x, const dfloat beta,
+            deviceMemory<dfloat>& o_y);
+  void SpMV(const dfloat alpha, deviceMemory<dfloat>& o_x, const dfloat beta,
+            deviceMemory<dfloat>& o_y, deviceMemory<dfloat>& o_z);
 
-  void smoothDampedJacobi(occa::memory& o_r, occa::memory& o_x,
+  void smoothDampedJacobi(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_x,
                           const dfloat lambda, bool x_is_zero,
-                          occa::memory& o_scratch);
+                          deviceMemory<dfloat>& o_scratch);
 
-  void smoothChebyshev(occa::memory& o_b, occa::memory& o_x,
+  void smoothChebyshev(deviceMemory<dfloat>& o_b, deviceMemory<dfloat>& o_x,
                        const dfloat lambda0, const dfloat lambda1,
-                       bool x_is_zero, occa::memory& o_scratch,
+                       bool x_is_zero, deviceMemory<dfloat>& o_scratch,
                        const int ChebyshevIterations);
 };
 
-
-
 } //namespace parAlmond
 
+} //namespace libp
+
 #endif
diff --git a/include/platform.hpp b/include/platform.hpp
index 4eaffef08..0dc1761c3 100644
--- a/include/platform.hpp
+++ b/include/platform.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,40 +28,58 @@ SOFTWARE.
 #define PLATFORM_HPP
 
 #define LIBP_MAJOR_VERSION 0
-#define LIBP_MINOR_VERSION 4
+#define LIBP_MINOR_VERSION 5
 #define LIBP_PATCH_VERSION 0
-#define LIBP_VERSION       00400
-#define LIBP_VERSION_STR   "0.4.0"
+#define LIBP_VERSION       00500
+#define LIBP_VERSION_STR   "0.5.0"
 
 #include "core.hpp"
+#include "memory.hpp"
+#include "comm.hpp"
 #include "settings.hpp"
 #include "linAlg.hpp"
 
+namespace libp {
+
 class platformSettings_t: public settings_t {
 public:
-  platformSettings_t(MPI_Comm _comm);
+  platformSettings_t(comm_t _comm);
   void report();
 };
 
+namespace internal {
+
+class iplatform_t { // state held behind platform_t's shared_ptr: settings plus device properties
+public:
+  platformSettings_t settings; // copy of the settings given to the constructor; returned by platform_t::settings()
+  properties_t props; // not set by the constructor; returned by platform_t::props()
+
+  iplatform_t(platformSettings_t& _settings):
+    settings(_settings) { // copies _settings; props is left default-initialized
+  }
+};
+
+} //namespace internal
+
 class platform_t {
 public:
-  const MPI_Comm& comm;
-  platformSettings_t& settings;
-  occa::properties props;
+  private:
+  std::shared_ptr<internal::iplatform_t> iplatform;
+  std::shared_ptr<linAlg_t> ilinAlg;
 
-  occa::device device;
-  linAlg_t linAlg;
+ public:
+  comm_t comm;
+  device_t device;
 
-  int rank, size;
+  platform_t()=default;
 
-  platform_t(platformSettings_t& _settings):
-    comm(_settings.comm),
-    settings(_settings) {
+  platform_t(platformSettings_t& _settings) {
+
+    iplatform = std::make_shared<internal::iplatform_t>(_settings);
 
-    MPI_Comm_rank(comm, &rank);
-    MPI_Comm_size(comm, &size);
+    comm = settings().comm;
 
-    if (rank==0) {
+    if (comm.rank()==0) {
       std::cout << "\n";
       std::cout << "\033[1m";
       std::cout << " _ _ _     ____                                             _ \n";
@@ -80,44 +98,140 @@ class platform_t {
     DeviceConfig();
     DeviceProperties();
 
-    linAlg.Setup(this);
+    ilinAlg = std::make_shared<linAlg_t>(this);
+  }
+
+  platform_t(const platform_t &other)=default;
+  platform_t& operator = (const platform_t &other)=default;
+
+  bool isInitialized() { // true when this handle holds an iplatform (a default-constructed platform_t does not)
+    return (iplatform!=nullptr);
+  }
+
+  void assertInitialized() { // guard called by the accessors and allocators below before touching platform state
+    LIBP_ABORT("Platform not initialized.",
+                  !isInitialized()); // presumably LIBP_ABORT(msg, cond) aborts when cond is true — confirm in core.hpp
+  }
+
+  kernel_t buildKernel(std::string fileName, std::string kernelName,
+                       properties_t& kernelInfo);
+
+  template <typename T>
+  deviceMemory<T> malloc(const size_t count, // count is a number of T entries, not bytes (see count*sizeof(T) below)
+                         const properties_t &prop = properties_t()) {
+    assertInitialized();
+    if (occa::dtype::get<T>() == occa::dtype::none) { // OCCA reports no dtype for T: fall back to an untyped byte allocation
+      return deviceMemory<T>(device.malloc(count*sizeof(T), occa::dtype::byte, prop));
+    } else { // T has an OCCA dtype: use the typed allocation
+      return deviceMemory<T>(device.malloc<T>(count, prop));
+    }
+  }
+
+  template <typename T>
+  deviceMemory<T> malloc(const size_t count, // count is a number of T entries, not bytes
+                         const memory<T> src, // src.ptr() is handed to device.malloc as the source pointer; presumably copied into the new buffer — confirm
+                         const properties_t &prop = properties_t()) {
+    assertInitialized();
+    if (occa::dtype::get<T>() == occa::dtype::none) { // OCCA reports no dtype for T: fall back to an untyped byte allocation
+      return deviceMemory<T>(device.malloc(count*sizeof(T), occa::dtype::byte, src.ptr(), prop));
+    } else { // T has an OCCA dtype: use the typed allocation
+      return deviceMemory<T>(device.malloc<T>(count, src.ptr(), prop));
+    }
+  }
+
+  template <typename T>
+  deviceMemory<T> malloc(const memory<T> src, // allocation is sized from src itself
+                         const properties_t &prop = properties_t()) {
+    assertInitialized();
+    if (occa::dtype::get<T>() == occa::dtype::none) { // untyped fallback; presumably src.size() is in bytes — confirm in memory.hpp
+      return deviceMemory<T>(device.malloc(src.size(), occa::dtype::byte, src.ptr(), prop));
+    } else { // typed allocation; presumably src.length() is in entries of T — confirm in memory.hpp
+      return deviceMemory<T>(device.malloc<T>(src.length(), src.ptr(), prop));
+    }
+  }
+
+  template <typename T>
+  pinnedMemory<T> hostMalloc(const size_t count){ // count is a number of T entries, not bytes
+    assertInitialized();
+    properties_t hostProp("host", true); // sets the "host" property so device.malloc is asked for host memory
+    if (occa::dtype::get<T>() == occa::dtype::none) { // OCCA reports no dtype for T: fall back to an untyped byte allocation
+      return pinnedMemory<T>(device.malloc(count*sizeof(T), occa::dtype::byte, nullptr, hostProp));
+    } else { // T has an OCCA dtype: use the typed allocation
+      return pinnedMemory<T>(device.malloc<T>(count, nullptr, hostProp));
+    }
+  }
+
+  template <typename T>
+  pinnedMemory<T> hostMalloc(const size_t count, // count is a number of T entries, not bytes
+                             const memory<T> src){ // src.ptr() is handed to device.malloc as the source pointer
+    assertInitialized();
+    properties_t hostProp("host", true); // sets the "host" property so device.malloc is asked for host memory
+    if (occa::dtype::get<T>() == occa::dtype::none) { // OCCA reports no dtype for T: fall back to an untyped byte allocation
+      return pinnedMemory<T>(device.malloc(count*sizeof(T), occa::dtype::byte, src.ptr(), hostProp));
+    } else { // T has an OCCA dtype: use the typed allocation
+      return pinnedMemory<T>(device.malloc<T>(count, src.ptr(), hostProp));
+    }
   }
 
-  ~platform_t(){}
+  template <typename T>
+  pinnedMemory<T> hostMalloc(const memory<T> src){ // allocation is sized from src itself
+    assertInitialized();
+    properties_t hostProp("host", true); // sets the "host" property so device.malloc is asked for host memory
+    if (occa::dtype::get<T>() == occa::dtype::none) { // untyped fallback; presumably src.size() is in bytes — confirm in memory.hpp
+      return pinnedMemory<T>(device.malloc(src.size(), occa::dtype::byte, src.ptr(), hostProp));
+    } else { // typed allocation; presumably src.length() is in entries of T — confirm in memory.hpp
+      return pinnedMemory<T>(device.malloc<T>(src.length(), src.ptr(), hostProp));
+    }
+  }
 
-  occa::kernel buildKernel(std::string fileName, std::string kernelName,
-                           occa::properties& kernelInfo);
+  linAlg_t& linAlg() { // accessor for the linAlg_t created in the settings constructor
+    assertInitialized(); // checks iplatform only; ilinAlg is assigned in the same constructor
+    return *ilinAlg;
+  }
 
-  occa::memory malloc(const size_t bytes,
-                      const void *src = NULL,
-                      const occa::properties &prop = occa::properties()) {
-    return device.malloc(bytes, src, prop);
+  settings_t& settings() { // returns the platformSettings_t held in iplatform through its settings_t base
+    assertInitialized();
+    return iplatform->settings;
   }
 
-  occa::memory malloc(const size_t bytes,
-                      const occa::memory &src,
-                      const occa::properties &prop = occa::properties()) {
-    return device.malloc(bytes, src, prop);
+  properties_t& props() { // mutable reference to the properties object held in iplatform
+    assertInitialized();
+    return iplatform->props;
   }
 
-  occa::memory malloc(const size_t bytes,
-                      const occa::properties &prop) {
-    return device.malloc(bytes, prop);
+  void finish() {
+    device.finish();
   }
 
-  void *hostMalloc(const size_t bytes,
-                   const void *src,
-                   occa::memory &h_mem){
-    occa::properties hostProp;
-    hostProp["host"] = true;
-    h_mem = device.malloc(bytes, src, hostProp);
-    return h_mem.ptr();
+  stream_t getStream() {
+    return device.getStream();
   }
 
-private:
+  void setStream(stream_t stream) {
+    device.setStream(stream);
+  }
+
+  int rank() const { // rank reported by comm; returns plain int because const on a by-value return is ignored and only triggers -Wignored-qualifiers
+    return comm.rank();
+  }
+
+  int size() const { // size reported by comm; returns plain int because const on a by-value return is ignored and only triggers -Wignored-qualifiers
+    return comm.size();
+  }
+
+  int getDeviceCount(const std::string mode) {
+    return occa::getDeviceCount(mode);
+  }
+
+  void setCacheDir(const std::string cacheDir) {
+    occa::env::setOccaCacheDir(cacheDir);
+  }
+
+ private:
   void DeviceConfig();
   void DeviceProperties();
-
 };
 
-#endif
\ No newline at end of file
+} //namespace libp
+
+#endif
diff --git a/include/precon.hpp b/include/precon.hpp
index d3975b7f0..4be0e32ae 100644
--- a/include/precon.hpp
+++ b/include/precon.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,28 +28,46 @@ SOFTWARE.
 #define PRECON_HPP
 
 #include "core.hpp"
+#include "operator.hpp"
 
-//base preconditioner
-class precon_t {
-public:
-  precon_t() {};
+namespace libp {
+
+/*Abstracted Preconditioner Object: forwards Operator() to the concrete operator_t created by Setup()*/
+class precon_t: public operator_t {
+ public:
+  void Operator(deviceMemory<dfloat> &o_r, deviceMemory<dfloat> &o_Mr) {
+    assertInitialized(); // Setup() must have been called first
+    precon->Operator(o_r, o_Mr);
+  }
 
-  virtual void Operator(occa::memory &o_r, occa::memory &o_Mr)=0;
+  /*Generic setup. Create a Precon object and wrap it in a shared_ptr*/
+  template<class Precon, class... Args>
+  void Setup(Args&& ... args) {
+    precon = std::make_shared<Precon>(args...); // NOTE(review): args are passed on as lvalues (no std::forward) — confirm this is intended
+  }
 
-  virtual ~precon_t() {}
+ private:
+  std::shared_ptr<operator_t> precon=nullptr; // shared, so copies of a precon_t refer to the same underlying object
+
+  void assertInitialized() {
+    LIBP_ABORT("Precon not initialized",
+               precon==nullptr); // presumably LIBP_ABORT(msg, cond) aborts when cond is true — confirm in core.hpp
+  }
 };
 
 //Identity operator
-class IdentityPrecon: public precon_t {
+class IdentityPrecon: public operator_t { // concrete operator whose Operator() copies its input to its output
 private:
   dlong N;
 
 public:
   IdentityPrecon(dlong _N): N(_N) {}
 
-  void Operator(occa::memory &o_r, occa::memory &o_Mr){
-    o_Mr.copyFrom(o_r, N*sizeof(dfloat)); //identity
+  void Operator(deviceMemory<dfloat> &o_r, deviceMemory<dfloat> &o_Mr){
+    o_Mr.copyFrom(o_r, N); //identity: copies N from o_r into o_Mr (presumably an entry count rather than bytes for deviceMemory<dfloat> — confirm)
   }
 };
 
+} //namespace libp
+
 #endif
diff --git a/include/settings.hpp b/include/settings.hpp
index aa404f64a..843e5c55e 100644
--- a/include/settings.hpp
+++ b/include/settings.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,28 +28,30 @@ SOFTWARE.
 #define SETTINGS_HPP
 
 #include <string>
+#include <cstring>
 #include <vector>
 #include <ostream>
 #include <sstream>
 #include <fstream>
 #include "core.hpp"
 
-using std::string;
-using std::vector;
-using std::ostream;
-using std::stringstream;
+namespace libp {
 
 class setting_t {
+  using string = std::string;
+  using stringstream = std::stringstream;
+
 private:
   string name;
   string val;
 
   string description;
-  vector<string> options;
+  std::vector<string> options;
 
 public:
   setting_t() = default;
-  setting_t(string name_, string val_, string description_="", vector<string> options_={});
+  setting_t(string name_, string val_,
+            string description_="", std::vector<string> options_={});
 
   ~setting_t() = default;
 
@@ -58,7 +60,7 @@ class setting_t {
 
   const string& getName() const;
   const string& getDescription() const;
-  const vector<string>& getOptions() const;
+  const std::vector<string>& getOptions() const;
 
   template<typename T>
   T getVal() const {
@@ -75,28 +77,29 @@ class setting_t {
   string toString() const;
 };
 
-ostream& operator<<(ostream& os, const setting_t& setting);
+std::ostream& operator<<(std::ostream& os, const setting_t& setting);
 
 class settings_t {
+  using string = std::string;
+  using stringstream = std::stringstream;
+
 private:
-  vector<string> insertOrder;
+  std::vector<string> insertOrder;
 
 public:
-  const MPI_Comm comm;
-  std::map<string, setting_t*> settings;
+  comm_t comm;
+  std::map<string, setting_t> settings;
 
-  settings_t() = delete;
-  settings_t(MPI_Comm _comm);
-
-  ~settings_t();
+  settings_t() = default;
+  settings_t(comm_t _comm);
 
   //copy
-  settings_t(const settings_t& other);
-  settings_t& operator=(const settings_t& other);
+  settings_t(const settings_t& other)=default;
+  settings_t& operator=(const settings_t& other)=default;
 
   void newSetting(const string name, const string val,
-                          const string description="",
-                          const vector<string> options={});
+                  const string description="",
+                  const std::vector<string> options={});
 
   bool hasSetting(const string name);
 
@@ -109,12 +112,10 @@ class settings_t {
   void getSetting(const string name, T& value) const {
     auto search = settings.find(name);
     if (search != settings.end()) {
-      setting_t* val = search->second;
-      value = val->getVal<T>();
+      const setting_t& val = search->second;
+      value = val.getVal<T>();
     } else {
-      stringstream ss;
-      ss << "Unable to find setting: [" << name << "]";
-      LIBP_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Unable to find setting: [" << name << "]");
     }
   }
 
@@ -127,6 +128,6 @@ class settings_t {
   void reportSetting(const string name) const;
 };
 
-
+} //namespace libp
 
 #endif
diff --git a/include/solver.hpp b/include/solver.hpp
index 4117b5c9e..d2fc37239 100644
--- a/include/solver.hpp
+++ b/include/solver.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,77 +29,82 @@ SOFTWARE.
 
 #include "settings.hpp"
 #include "platform.hpp"
+#include "operator.hpp"
 
-class solver_t {
+namespace libp {
+
+class solver_t: public operator_t {
 public:
-  platform_t& platform;
-  settings_t& settings;
+  platform_t platform;
+  settings_t settings;
+  comm_t comm;
 
-  solver_t() = delete;
+  solver_t() = default;
 
-  solver_t(platform_t& _platform, settings_t& _settings):
+  solver_t(platform_t& _platform, settings_t& _settings, comm_t _comm):
     platform(_platform),
-    settings(_settings) {};
-
-  virtual ~solver_t(){}
+    settings(_settings),
+    comm(_comm) {};
 
   virtual void Run() {
-    LIBP_ABORT(string("Run not implemented in this solver"))
+    LIBP_FORCE_ABORT("Run not implemented in this solver");
   };
-  virtual void Report(dfloat time=0.0, int tstep=0) {
-    LIBP_ABORT(string("Report not implemented in this solver"))
+  virtual void Report(dfloat time, int tstep) {
+    LIBP_FORCE_ABORT("Report not implemented in this solver");
   }
 
   //Full rhs evaluation of solver in form dq/dt = rhsf(q,t)
-  virtual void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time) {
-    LIBP_ABORT(string("rhsf not implemented in this solver"))
+  virtual void rhsf(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time) {
+    LIBP_FORCE_ABORT("rhsf not implemented in this solver");
   }
 
   // Partial rhs evaluation of f with solver in form dq/dt = f(q,t) + g(q,t)
-  virtual void rhs_imex_f(occa::memory& o_q, occa::memory& o_rhs, const dfloat time) {
-    LIBP_ABORT(string("rhs_imex_f not implemented in this solver"))
+  virtual void rhs_imex_f(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time) {
+    LIBP_FORCE_ABORT("rhs_imex_f not implemented in this solver");
   }
 
   // Partial rhs evaluation of g with solver in form dq/dt = f(q,t) + g(q,t)
-  virtual void rhs_imex_g(occa::memory& o_q, occa::memory& o_rhs, const dfloat time) {
-    LIBP_ABORT(string("rhs_imex_g not implemented in this solver"))
+  virtual void rhs_imex_g(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time) {
+    LIBP_FORCE_ABORT("rhs_imex_g not implemented in this solver");
   }
 
   // Inversion of g function with solver in form dq/dt = f(q,t) + g(q,t)
   //  Solves gamma*q - g(q,t) = rhs for q
-  virtual void rhs_imex_invg(occa::memory& o_rhs, occa::memory& o_q, const dfloat gamma, const dfloat time) {
-    LIBP_ABORT(string("rhs_imex_invg not implemented in this solver"))
+  virtual void rhs_imex_invg(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_q, const dfloat gamma, const dfloat time) {
+    LIBP_FORCE_ABORT("rhs_imex_invg not implemented in this solver");
   }
 
   // Evolve rhs f function via a sub-timestepper
-  virtual void rhs_subcycle_f(occa::memory& o_Q, occa::memory& o_QHAT,
-                           const dfloat T, const dfloat dt, const dfloat* B,
+  virtual void rhs_subcycle_f(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_QHAT,
+                           const dfloat T, const dfloat dt, const memory<dfloat> B,
                            const int order, const int shiftIndex, const int maxOrder) {
-    LIBP_ABORT(string("Subcycling not implemented in this solver"))
+    LIBP_FORCE_ABORT("Subcycling not implemented in this solver");
   }
 
   //Full rhs evaluation of solver in form dq/dt = rhsf(q,t) for multi-rate timestepping
-  virtual void rhsf_MR(occa::memory& o_q, occa::memory& o_rhs, occa::memory& o_fQM, const dfloat time, const int level) {
-    LIBP_ABORT(string("rhsf_MR not implemented in this solver"))
+  virtual void rhsf_MR(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_fQM, const dfloat time, const int level) {
+    LIBP_FORCE_ABORT("rhsf_MR not implemented in this solver");
   }
 
   //Full rhs evaluation of solver in form dq/dt = rhsf(q,t) with a perfectly matched layer (PML)
-  virtual void rhsf_pml(occa::memory& o_q, occa::memory& o_pmlq,
-                        occa::memory& o_rhs, occa::memory& o_pmlrhs, const dfloat time) {
-    LIBP_ABORT(string("rhsf_pml not implemented in this solver"))
+  virtual void rhsf_pml(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_pmlq,
+                        deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_pmlrhs, const dfloat time) {
+    LIBP_FORCE_ABORT("rhsf_pml not implemented in this solver");
   }
 
   //Full rhs evaluation of solver in form dq/dt = rhsf(q,t) for multi-rate timestepping with a PML
-  virtual void rhsf_MR_pml(occa::memory& o_q, occa::memory& o_pmlq,
-                           occa::memory& o_rhs, occa::memory& o_pmlrhs,
-                           occa::memory& o_fQM, const dfloat time, const int level) {
-    LIBP_ABORT(string("rhsf_MR_pml not implemented in this solver"))
+  virtual void rhsf_MR_pml(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_pmlq,
+                           deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_pmlrhs,
+                           deviceMemory<dfloat>& o_fQM, const dfloat time, const int level) {
+    LIBP_FORCE_ABORT("rhsf_MR_pml not implemented in this solver");
   }
 
   //Evaluation of solver as a operator in the form A(q)
-  virtual void Operator(occa::memory& o_q, occa::memory& o_Aq) {
-    LIBP_ABORT(string("Operator not implemented in this solver"))
+  virtual void Operator(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_Aq) {
+    LIBP_FORCE_ABORT("Operator not implemented in this solver");
   }
 };
 
-#endif
\ No newline at end of file
+} //namespace libp
+
+#endif
diff --git a/include/timeStepper.hpp b/include/timeStepper.hpp
index 153b21561..f8d5e79b3 100644
--- a/include/timeStepper.hpp
+++ b/include/timeStepper.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -32,101 +32,140 @@ SOFTWARE.
 #include "mesh.hpp"
 #include "solver.hpp"
 
+namespace libp {
+
+//forward declare
+namespace TimeStepper { class timeStepperBase_t; }
+
+/* General TimeStepper object*/
+class timeStepper_t {
+ public:
+  timeStepper_t() = default;
+
+  /*Generic setup. Create a Stepper object and wrap it in a shared_ptr*/
+  template<class Stepper, class... Args>
+  void Setup(Args&& ... args) {
+    ts = std::make_shared<Stepper>(args...);
+  }
+
+  void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end);
+
+  void SetTimeStep(dfloat dt_);
+
+  dfloat GetTimeStep();
+
+  dfloat GetGamma();
+
+ private:
+  std::shared_ptr<TimeStepper::timeStepperBase_t> ts=nullptr;
+
+  void assertInitialized();
+};
+
 namespace TimeStepper {
 
 //base time stepper class
-class timeStepper_t {
+class timeStepperBase_t {
 public:
+  platform_t platform;
+  comm_t comm;
+
   dlong N;
   dlong Nhalo;
 
-  solver_t& solver;
-
   dfloat dt;
 
-  timeStepper_t(dlong Nelements, dlong NhaloElements,
-                 int Np, int Nfields, solver_t& _solver):
+  timeStepperBase_t(dlong Nelements, dlong NhaloElements,
+                    int Np, int Nfields,
+                    platform_t& _platform, comm_t _comm):
+    platform(_platform),
+    comm(_comm),
     N(Nelements*Np*Nfields),
-    Nhalo(NhaloElements*Np*Nfields),
-    solver(_solver) {}
+    Nhalo(NhaloElements*Np*Nfields) {}
 
-  virtual ~timeStepper_t() {};
-  virtual void Run(occa::memory& o_q, dfloat start, dfloat end)=0;
+  virtual void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end)=0;
 
   void SetTimeStep(dfloat dt_) {dt = dt_;};
+
   dfloat GetTimeStep() {return dt;};
+
+  virtual dfloat GetGamma() {
+    LIBP_FORCE_ABORT("GetGamma() not available in this Timestepper");
+    return 0.0;
+  }
 };
 
 /* Adams Bashforth, order 3 */
-class ab3: public timeStepper_t {
+class ab3: public timeStepperBase_t {
 protected:
   int Nstages;
   int shiftIndex;
 
-  dfloat *ab_a;
-  occa::memory o_ab_a;
+  memory<dfloat> ab_a;
+  deviceMemory<dfloat> o_ab_a;
 
-  occa::memory o_rhsq;
+  deviceMemory<dfloat> o_rhsq;
 
-  occa::kernel updateKernel;
+  kernel_t updateKernel;
 
-  virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order);
+  virtual void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt, int order);
 
 public:
   ab3(dlong Nelements, dlong NhaloElements,
-      int Np, int Nfields, solver_t& solver);
-  ~ab3();
+      int Np, int Nfields,
+      platform_t& _platform, comm_t _comm);
 
-  void Run(occa::memory& o_q, dfloat start, dfloat end);
+  void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end);
 };
 
 /* Low-Storage Explicit Runge-Kutta, order 4 */
-class lserk4: public timeStepper_t {
+class lserk4: public timeStepperBase_t {
 protected:
   int Nrk;
-  dfloat *rka, *rkb, *rkc;
+  memory<dfloat> rka, rkb, rkc;
+
+  deviceMemory<dfloat> o_rhsq;
+  deviceMemory<dfloat> o_resq;
 
-  occa::memory o_rhsq;
-  occa::memory o_resq;
+  deviceMemory<dfloat> o_saveq;
 
-  occa::kernel updateKernel;
+  kernel_t updateKernel;
 
-  virtual void Step(occa::memory& o_q, dfloat time, dfloat dt);
+  virtual void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt);
 
 public:
   lserk4(dlong Nelements, dlong NhaloElements,
-         int Np, int Nfields, solver_t& solver);
-  ~lserk4();
+         int Np, int Nfields,
+         platform_t& _platform, comm_t _comm);
 
-  void Run(occa::memory& o_q, dfloat start, dfloat end);
+  void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end);
 };
 
 /* Dormand-Prince method */
 /* Explict Runge-Kutta, order 5 with embedded order 4 and adaptive time-stepping */
-class dopri5: public timeStepper_t {
+class dopri5: public timeStepperBase_t {
 protected:
-  MPI_Comm comm;
   int Nrk;
 
   dlong Nblock;
 
-  dfloat *rkC, *rkA, *rkE;
-  occa::memory o_rkA, o_rkE;
+  memory<dfloat> rkC, rkA, rkE;
+  deviceMemory<dfloat> o_rkA, o_rkE;
 
-  dfloat *errtmp;
-  occa::memory o_errtmp, h_errtmp;
+  deviceMemory<dfloat> o_errtmp;
+  pinnedMemory<dfloat> h_errtmp;
 
-  occa::memory o_rhsq;
-  occa::memory o_rkq;
-  occa::memory o_rkrhsq;
-  occa::memory o_rkerr;
+  deviceMemory<dfloat> o_rhsq;
+  deviceMemory<dfloat> o_rkq;
+  deviceMemory<dfloat> o_rkrhsq;
+  deviceMemory<dfloat> o_rkerr;
 
-  occa::memory o_saveq;
+  deviceMemory<dfloat> o_saveq;
 
 
-  occa::kernel rkUpdateKernel;
-  occa::kernel rkStageKernel;
-  occa::kernel rkErrorEstimateKernel;
+  kernel_t rkUpdateKernel;
+  kernel_t rkStageKernel;
+  kernel_t rkErrorEstimateKernel;
 
   dfloat dtMIN; //minumum allowed timestep
   dfloat ATOL;  //absolute error tolerance
@@ -144,24 +183,24 @@ class dopri5: public timeStepper_t {
   dfloat facold;
   dfloat sqrtinvNtotal;
 
-  virtual void Backup(occa::memory &o_Q);
-  virtual void Restore(occa::memory &o_Q);
-  virtual void AcceptStep(occa::memory &o_q, occa::memory &o_rq);
+  virtual void Backup(deviceMemory<dfloat> &o_Q);
+  virtual void Restore(deviceMemory<dfloat> &o_Q);
+  virtual void AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq);
 
-  virtual void Step(occa::memory& o_q, dfloat time, dfloat dt);
+  virtual void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt);
 
-  virtual dfloat Estimater(occa::memory& o_q);
+  virtual dfloat Estimater(deviceMemory<dfloat>& o_q);
 
 public:
   dopri5(dlong Nelements, dlong NhaloElements,
-         int Np, int Nfields, solver_t& solver, MPI_Comm _comm);
-  ~dopri5();
+         int Np, int Nfields,
+         platform_t& _platform, comm_t _comm);
 
-  void Run(occa::memory& o_q, dfloat start, dfloat end);
+  void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end);
 };
 
 /* Semi-Analytic Adams-Bashforth, order 3 */
-class saab3: public timeStepper_t {
+class saab3: public timeStepperBase_t {
 protected:
   int Nstages;
   int shiftIndex;
@@ -169,59 +208,56 @@ class saab3: public timeStepper_t {
   int Np, Nfields;
   dlong Nblock, Nelements, NhaloElements;
 
-  dfloat *lambda;
+  memory<dfloat> lambda;
 
-  dfloat *saab_x, *saab_a;
-  occa::memory o_saab_x, o_saab_a;
+  pinnedMemory<dfloat> h_saab_x, h_saab_a;
+  deviceMemory<dfloat> o_saab_x, o_saab_a;
 
-  occa::memory o_rhsq;
+  deviceMemory<dfloat> o_rhsq;
 
-  occa::kernel updateKernel;
+  kernel_t updateKernel;
 
-  virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order);
+  virtual void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt, int order);
 
   virtual void UpdateCoefficients();
 
 public:
   saab3(dlong _Nelements, dlong _NhaloElements,
         int _Np, int _Nfields,
-        dfloat *_lambda,
-        solver_t& _solver);
-  ~saab3();
+        memory<dfloat> _lambda,
+        platform_t& _platform, comm_t _comm);
 
-  void Run(occa::memory& o_q, dfloat start, dfloat end);
+  void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end);
 };
 
 /* Semi-Analytic Explict Runge-Kutta, order 4 with embedded order 3 and adaptive time-stepping */
-class sark4: public timeStepper_t {
+class sark4: public timeStepperBase_t {
 protected:
-  MPI_Comm comm;
   int Nrk;
   int order, embeddedOrder;
 
   int Np, Nfields;
   dlong Nblock, Nelements, NhaloElements;
 
-  dfloat *lambda;
-
-  dfloat *rkC, *rkX, *rkA, *rkE;
-  occa::memory h_rkX, h_rkA, h_rkE;
-  occa::memory o_rkX, o_rkA, o_rkE;
+  memory<dfloat> lambda;
 
-  dfloat *errtmp;
+  memory<dfloat> rkC;
+  deviceMemory<dfloat> o_rkX, o_rkA, o_rkE;
+  pinnedMemory<dfloat> h_rkX, h_rkA, h_rkE;
 
-  occa::memory o_rhsq;
-  occa::memory o_rkq;
-  occa::memory o_rkrhsq;
-  occa::memory o_rkerr;
+  deviceMemory<dfloat> o_rhsq;
+  deviceMemory<dfloat> o_rkq;
+  deviceMemory<dfloat> o_rkrhsq;
+  deviceMemory<dfloat> o_rkerr;
 
-  occa::memory o_saveq;
+  deviceMemory<dfloat> o_saveq;
 
-  occa::memory o_errtmp;
+  deviceMemory<dfloat> o_errtmp;
+  pinnedMemory<dfloat> h_errtmp;
 
-  occa::kernel rkUpdateKernel;
-  occa::kernel rkStageKernel;
-  occa::kernel rkErrorEstimateKernel;
+  kernel_t rkUpdateKernel;
+  kernel_t rkStageKernel;
+  kernel_t rkErrorEstimateKernel;
 
   dfloat dtMIN; //minumum allowed timestep
   dfloat ATOL;  //absolute error tolerance
@@ -239,56 +275,54 @@ class sark4: public timeStepper_t {
   dfloat facold;
   dfloat sqrtinvNtotal;
 
-  virtual void Backup(occa::memory &o_Q);
-  virtual void Restore(occa::memory &o_Q);
-  virtual void AcceptStep(occa::memory &o_q, occa::memory &o_rq);
+  virtual void Backup(deviceMemory<dfloat> &o_Q);
+  virtual void Restore(deviceMemory<dfloat> &o_Q);
+  virtual void AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq);
 
-  virtual void Step(occa::memory& o_q, dfloat time, dfloat dt);
+  virtual void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt);
 
-  dfloat Estimater(occa::memory& o_q);
+  dfloat Estimater(deviceMemory<dfloat>& o_q);
 
   void UpdateCoefficients();
 
 public:
   sark4(dlong _Nelements, dlong _NhaloElements,
         int _Np, int _Nfields,
-        dfloat *_lambda,
-        solver_t& _solver, MPI_Comm _comm);
-  ~sark4();
+        memory<dfloat> _lambda,
+        platform_t& _platform, comm_t _comm);
 
-  void Run(occa::memory& o_q, dfloat start, dfloat end);
+  void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end);
 };
 
 /* Semi-Analytic Explict Runge-Kutta, order 5 with embedded order 4 and adaptive time-stepping */
-class sark5: public timeStepper_t {
+class sark5: public timeStepperBase_t {
 protected:
-  MPI_Comm comm;
   int Nrk;
   int order, embeddedOrder;
 
   int Np, Nfields;
   dlong Nblock, Nelements, NhaloElements;
 
-  dfloat *lambda;
+  memory<dfloat> lambda;
 
-  dfloat *rkC, *rkX, *rkA, *rkE;
-  occa::memory h_rkX, h_rkA, h_rkE;
-  occa::memory o_rkX, o_rkA, o_rkE;
+  memory<dfloat> rkC;
+  deviceMemory<dfloat> o_rkX, o_rkA, o_rkE;
+  pinnedMemory<dfloat> h_rkX, h_rkA, h_rkE;
 
-  dfloat *errtmp;
 
-  occa::memory o_rhsq;
-  occa::memory o_rkq;
-  occa::memory o_rkrhsq;
-  occa::memory o_rkerr;
+  deviceMemory<dfloat> o_rhsq;
+  deviceMemory<dfloat> o_rkq;
+  deviceMemory<dfloat> o_rkrhsq;
+  deviceMemory<dfloat> o_rkerr;
 
-  occa::memory o_saveq;
+  deviceMemory<dfloat> o_saveq;
 
-  occa::memory o_errtmp;
+  deviceMemory<dfloat> o_errtmp;
+  pinnedMemory<dfloat> h_errtmp;
 
-  occa::kernel rkUpdateKernel;
-  occa::kernel rkStageKernel;
-  occa::kernel rkErrorEstimateKernel;
+  kernel_t rkUpdateKernel;
+  kernel_t rkStageKernel;
+  kernel_t rkErrorEstimateKernel;
 
   dfloat dtMIN; //minumum allowed timestep
   dfloat ATOL;  //absolute error tolerance
@@ -306,154 +340,151 @@ class sark5: public timeStepper_t {
   dfloat facold;
   dfloat sqrtinvNtotal;
 
-  virtual void Backup(occa::memory &o_Q);
-  virtual void Restore(occa::memory &o_Q);
-  virtual void AcceptStep(occa::memory &o_q, occa::memory &o_rq);
+  virtual void Backup(deviceMemory<dfloat> &o_Q);
+  virtual void Restore(deviceMemory<dfloat> &o_Q);
+  virtual void AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq);
 
-  virtual void Step(occa::memory& o_q, dfloat time, dfloat dt);
+  virtual void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt);
 
-  dfloat Estimater(occa::memory& o_q);
+  dfloat Estimater(deviceMemory<dfloat>& o_q);
 
   void UpdateCoefficients();
 
 public:
   sark5(dlong _Nelements, dlong _NhaloElements,
         int _Np, int _Nfields,
-        dfloat *_lambda,
-        solver_t& _solver, MPI_Comm _comm);
-  ~sark5();
+        memory<dfloat> _lambda,
+        platform_t& _platform, comm_t _comm);
 
-  void Run(occa::memory& o_q, dfloat start, dfloat end);
+  void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end);
 };
 
 /* Backward Difference Formula, order 3, with extrapolation */
-class extbdf3: public timeStepper_t {
+class extbdf3: public timeStepperBase_t {
 protected:
   int Nstages;
   int shiftIndex;
 
-  dfloat *extbdf_a;
-  dfloat *extbdf_b;
-  occa::memory o_extbdf_a;
-  occa::memory o_extbdf_b;
+  memory<dfloat> extbdf_a;
+  memory<dfloat> extbdf_b;
+  deviceMemory<dfloat> o_extbdf_a;
+  deviceMemory<dfloat> o_extbdf_b;
 
-  occa::memory o_rhs;
-  occa::memory o_qn;
-  occa::memory o_F;
+  deviceMemory<dfloat> o_rhs;
+  deviceMemory<dfloat> o_qn;
+  deviceMemory<dfloat> o_F;
 
-  occa::kernel rhsKernel;
+  kernel_t rhsKernel;
 
-  virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order);
+  virtual void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt, int order);
 
 public:
   extbdf3(dlong Nelements, dlong NhaloElements,
-      int Np, int Nfields, solver_t& solver);
-  ~extbdf3();
+      int Np, int Nfields,
+      platform_t& _platform, comm_t _comm);
 
-  dfloat getGamma();
+  dfloat GetGamma();
 
-  void Run(occa::memory& o_q, dfloat start, dfloat end);
+  void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end);
 };
 
 /* Backward Difference Formula, order 3, with subcycling */
-class ssbdf3: public timeStepper_t {
+class ssbdf3: public timeStepperBase_t {
 protected:
   int Nstages;
   int shiftIndex;
 
-  dfloat *ssbdf_b;
-  occa::memory o_ssbdf_b;
+  memory<dfloat> ssbdf_b;
+  deviceMemory<dfloat> o_ssbdf_b;
 
-  occa::memory o_rhs;
-  occa::memory o_qn;
-  occa::memory o_qhat;
+  deviceMemory<dfloat> o_rhs;
+  deviceMemory<dfloat> o_qn;
+  deviceMemory<dfloat> o_qhat;
 
-  occa::kernel rhsKernel;
+  kernel_t rhsKernel;
 
-  virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order);
+  virtual void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt, int order);
 
 public:
   ssbdf3(dlong Nelements, dlong NhaloElements,
-      int Np, int Nfields, solver_t& solver);
-  ~ssbdf3();
+      int Np, int Nfields,
+      platform_t& _platform, comm_t _comm);
 
-  dfloat getGamma();
+  dfloat GetGamma();
 
-  void Run(occa::memory& o_q, dfloat start, dfloat end);
+  void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end);
 };
 
 /* Multi-rate Adams-Bashforth, order 3 */
-class mrab3: public timeStepper_t {
+class mrab3: public timeStepperBase_t {
 protected:
-  mesh_t &mesh;
+  mesh_t mesh;
 
   int Nstages;
   int Nlevels;
   int Nfields;
 
-  int* shiftIndex;
-  occa::memory o_shiftIndex, h_shiftIndex;
+  deviceMemory<int> o_shiftIndex;
+  deviceMemory<int> h_shiftIndex;
 
-  dfloat *mrdt;
-  occa::memory o_mrdt;
+  memory<dfloat> mrdt;
+  deviceMemory<dfloat> o_mrdt;
 
-  dfloat *ab_a, *ab_b;
-  occa::memory o_ab_a, o_ab_b;
+  memory<dfloat> ab_a, ab_b;
+  deviceMemory<dfloat> o_ab_a, o_ab_b;
 
-  occa::memory o_rhsq0, o_rhsq, o_fQM;
+  deviceMemory<dfloat> o_rhsq0, o_rhsq, o_fQM;
 
-  occa::kernel updateKernel;
-  occa::kernel traceUpdateKernel;
+  kernel_t updateKernel;
+  kernel_t traceUpdateKernel;
 
-  virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order);
+  virtual void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt, int order);
 
 public:
   mrab3(dlong _Nelements, dlong _NhaloElements,
          int _Np, int _Nfields,
-         solver_t& _solver, mesh_t& _mesh);
-  ~mrab3();
+         platform_t& _platform, mesh_t& _mesh);
 
-  void Run(occa::memory& o_q, dfloat start, dfloat end);
+  void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end);
 };
 
 /* Multi-rate Semi-Analytic Adams-Bashforth, order 3 */
-class mrsaab3: public timeStepper_t {
+class mrsaab3: public timeStepperBase_t {
 protected:
-  mesh_t &mesh;
+  mesh_t mesh;
 
   int Nstages;
   int Nlevels;
   int Nfields;
 
-  dfloat *lambda;
+  memory<dfloat> lambda;
 
-  int* shiftIndex;
-  occa::memory o_shiftIndex, h_shiftIndex;
+  deviceMemory<int> o_shiftIndex;
+  pinnedMemory<int> h_shiftIndex;
 
-  dfloat *mrdt;
-  occa::memory o_mrdt;
+  memory<dfloat> mrdt;
+  deviceMemory<dfloat> o_mrdt;
 
-  dfloat *saab_x, *saab_a, *saab_b;
-  occa::memory o_saab_x, o_saab_a, o_saab_b;
+  memory<dfloat> saab_x, saab_a, saab_b;
+  deviceMemory<dfloat> o_saab_x, o_saab_a, o_saab_b;
 
-  occa::memory o_rhsq0, o_rhsq, o_fQM;
+  deviceMemory<dfloat> o_rhsq0, o_rhsq, o_fQM;
 
-  occa::kernel updateKernel;
-  occa::kernel traceUpdateKernel;
+  kernel_t updateKernel;
+  kernel_t traceUpdateKernel;
 
-  virtual void Step(occa::memory& o_q, dfloat time, dfloat dt, int order);
+  virtual void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt, int order);
 
   void UpdateCoefficients();
 
 public:
   mrsaab3(dlong _Nelements, dlong _NhaloElements,
          int _Np, int _Nfields,
-         dfloat *_lambda,
-         solver_t& _solver, mesh_t& _mesh);
-  ~mrsaab3();
+         memory<dfloat> _lambda,
+         platform_t& _platform, mesh_t& _mesh);
 
   void Init();
-  void Run(occa::memory& o_q, dfloat start, dfloat end);
+  void Run(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat start, dfloat end);
 };
 
 
@@ -466,15 +497,15 @@ class ab3_pml: public ab3 {
 private:
   dlong Npml;
 
-  occa::memory o_pmlq;
-  occa::memory o_rhspmlq;
+  deviceMemory<dfloat> o_pmlq;
+  deviceMemory<dfloat> o_rhspmlq;
 
-  void Step(occa::memory& o_q, dfloat time, dfloat dt, int order);
+  void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt, int order);
 
 public:
   ab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
-          int Np, int Nfields, int Npmlfields, solver_t& solver);
-  ~ab3_pml();
+          int Np, int Nfields, int Npmlfields,
+          platform_t& _platform, comm_t _comm);
 };
 
 /* Low-Storage Explicit Runge-Kutta, order 4 */
@@ -482,16 +513,16 @@ class lserk4_pml: public lserk4 {
 private:
   dlong Npml;
 
-  occa::memory o_pmlq;
-  occa::memory o_rhspmlq;
-  occa::memory o_respmlq;
+  deviceMemory<dfloat> o_pmlq;
+  deviceMemory<dfloat> o_rhspmlq;
+  deviceMemory<dfloat> o_respmlq;
 
-  void Step(occa::memory& o_q, dfloat time, dfloat dt);
+  void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt);
 
 public:
   lserk4_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
-            int Np, int Nfields, int Npmlfields, solver_t& solver);
-  ~lserk4_pml();
+            int Np, int Nfields, int Npmlfields,
+            platform_t& _platform, comm_t _comm);
 };
 
 /* Dormand-Prince method */
@@ -500,25 +531,25 @@ class dopri5_pml: public dopri5 {
 private:
   dlong Npml;
 
-  occa::memory o_pmlq;
-  occa::memory o_rhspmlq;
-  occa::memory o_rkpmlq;
-  occa::memory o_rkrhspmlq;
+  deviceMemory<dfloat> o_pmlq;
+  deviceMemory<dfloat> o_rhspmlq;
+  deviceMemory<dfloat> o_rkpmlq;
+  deviceMemory<dfloat> o_rkrhspmlq;
 
-  occa::memory o_savepmlq;
+  deviceMemory<dfloat> o_savepmlq;
 
-  occa::kernel rkPmlUpdateKernel;
+  kernel_t rkPmlUpdateKernel;
 
-  void Backup(occa::memory &o_Q);
-  void Restore(occa::memory &o_Q);
-  void AcceptStep(occa::memory &o_q, occa::memory &o_rq);
+  void Backup(deviceMemory<dfloat> &o_Q);
+  void Restore(deviceMemory<dfloat> &o_Q);
+  void AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq);
 
-  void Step(occa::memory& o_q, dfloat time, dfloat dt);
+  void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt);
 
 public:
   dopri5_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
-            int Np, int Nfields, int Npmlfields, solver_t& solver, MPI_Comm _comm);
-  ~dopri5_pml();
+            int Np, int Nfields, int Npmlfields,
+            platform_t& _platform, comm_t _comm);
 };
 
 /* Semi-Analytic Adams-Bashforth, order 3 */
@@ -527,21 +558,21 @@ class saab3_pml: public saab3 {
 private:
   dlong Npml;
 
-  dfloat *pmlsaab_x, *pmlsaab_a;
-  occa::memory o_pmlsaab_x, o_pmlsaab_a;
+  memory<dfloat> pmlsaab_x, pmlsaab_a;
+  deviceMemory<dfloat> o_pmlsaab_x, o_pmlsaab_a;
 
-  occa::memory o_pmlq;
-  occa::memory o_rhspmlq;
+  deviceMemory<dfloat> o_pmlq;
+  deviceMemory<dfloat> o_rhspmlq;
 
-  occa::kernel pmlUpdateKernel;
+  kernel_t pmlUpdateKernel;
 
-  void Step(occa::memory& o_q, dfloat time, dfloat dt, int order);
+  void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt, int order);
 
 public:
   saab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
             int Np, int Nfields, int _Npmlfields,
-            dfloat *_lambda, solver_t& solver);
-  ~saab3_pml();
+            memory<dfloat> _lambda,
+            platform_t& _platform, comm_t _comm);
 };
 
 /* Semi-Analytic Explict Runge-Kutta, order 4 with embedded order 3 and adaptive time-stepping */
@@ -550,30 +581,29 @@ class sark4_pml: public sark4 {
 private:
   dlong Npml;
 
-  dfloat *pmlrkA;
-  occa::memory o_pmlrkA;
+  memory<dfloat> pmlrkA;
+  deviceMemory<dfloat> o_pmlrkA;
 
-  occa::memory o_pmlq;
-  occa::memory o_rhspmlq;
-  occa::memory o_rkpmlq;
-  occa::memory o_rkrhspmlq;
+  deviceMemory<dfloat> o_pmlq;
+  deviceMemory<dfloat> o_rhspmlq;
+  deviceMemory<dfloat> o_rkpmlq;
+  deviceMemory<dfloat> o_rkrhspmlq;
 
-  occa::memory o_savepmlq;
+  deviceMemory<dfloat> o_savepmlq;
 
-  occa::kernel rkPmlUpdateKernel;
-  occa::kernel rkPmlStageKernel;
+  kernel_t rkPmlUpdateKernel;
+  kernel_t rkPmlStageKernel;
 
-  void Backup(occa::memory &o_Q);
-  void Restore(occa::memory &o_Q);
-  void AcceptStep(occa::memory &o_q, occa::memory &o_rq);
+  void Backup(deviceMemory<dfloat> &o_Q);
+  void Restore(deviceMemory<dfloat> &o_Q);
+  void AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq);
 
-  void Step(occa::memory& o_q, dfloat time, dfloat dt);
+  void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt);
 
 public:
   sark4_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
             int Np, int Nfields, int _Npmlfields,
-            dfloat *_lambda, solver_t& solver, MPI_Comm _comm);
-  ~sark4_pml();
+            memory<dfloat> _lambda, platform_t& _platform, comm_t _comm);
 };
 
 /* Semi-Analytic Explict Runge-Kutta, order 5 with embedded order 4 and adaptive time-stepping */
@@ -582,30 +612,29 @@ class sark5_pml: public sark5 {
 private:
   dlong Npml;
 
-  dfloat *pmlrkA;
-  occa::memory o_pmlrkA;
+  memory<dfloat> pmlrkA;
+  deviceMemory<dfloat> o_pmlrkA;
 
-  occa::memory o_pmlq;
-  occa::memory o_rhspmlq;
-  occa::memory o_rkpmlq;
-  occa::memory o_rkrhspmlq;
+  deviceMemory<dfloat> o_pmlq;
+  deviceMemory<dfloat> o_rhspmlq;
+  deviceMemory<dfloat> o_rkpmlq;
+  deviceMemory<dfloat> o_rkrhspmlq;
 
-  occa::memory o_savepmlq;
+  deviceMemory<dfloat> o_savepmlq;
 
-  occa::kernel rkPmlUpdateKernel;
-  occa::kernel rkPmlStageKernel;
+  kernel_t rkPmlUpdateKernel;
+  kernel_t rkPmlStageKernel;
 
-  void Backup(occa::memory &o_Q);
-  void Restore(occa::memory &o_Q);
-  void AcceptStep(occa::memory &o_q, occa::memory &o_rq);
+  void Backup(deviceMemory<dfloat> &o_Q);
+  void Restore(deviceMemory<dfloat> &o_Q);
+  void AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq);
 
-  void Step(occa::memory& o_q, dfloat time, dfloat dt);
+  void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt);
 
 public:
   sark5_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
             int Np, int Nfields, int _Npmlfields,
-            dfloat *_lambda, solver_t& solver, MPI_Comm _comm);
-  ~sark5_pml();
+            memory<dfloat> _lambda, platform_t& _platform, comm_t _comm);
 };
 
 
@@ -615,17 +644,16 @@ class mrab3_pml: public mrab3 {
   dlong Npml;
   int Npmlfields;
 
-  occa::memory o_pmlq;
-  occa::memory o_rhspmlq0, o_rhspmlq;
+  deviceMemory<dfloat> o_pmlq;
+  deviceMemory<dfloat> o_rhspmlq0, o_rhspmlq;
 
-  occa::kernel pmlUpdateKernel;
+  kernel_t pmlUpdateKernel;
 
-  void Step(occa::memory& o_q, dfloat time, dfloat dt, int order);
+  void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt, int order);
 
 public:
   mrab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
-            int Np, int Nfields, int _Npmlfields, solver_t& solver, mesh_t& _mesh);
-  ~mrab3_pml();
+            int Np, int Nfields, int _Npmlfields, platform_t& _platform, mesh_t& _mesh);
 };
 
 /* Multi-rate Semi-Analytic Adams-Bashforth, order 3 */
@@ -635,23 +663,25 @@ class mrsaab3_pml: public mrsaab3 {
   dlong Npml;
   int Npmlfields;
 
-  occa::memory o_pmlq;
-  dfloat *pmlsaab_a, *pmlsaab_b;
-  occa::memory o_pmlsaab_a, o_pmlsaab_b;
+  deviceMemory<dfloat> o_pmlq;
 
-  occa::memory o_rhspmlq0, o_rhspmlq;
+  memory<dfloat> pmlsaab_a, pmlsaab_b;
+  deviceMemory<dfloat> o_pmlsaab_a, o_pmlsaab_b;
 
-  occa::kernel pmlUpdateKernel;
+  deviceMemory<dfloat> o_rhspmlq0, o_rhspmlq;
 
-  void Step(occa::memory& o_q, dfloat time, dfloat dt, int order);
+  kernel_t pmlUpdateKernel;
+
+  void Step(solver_t& solver, deviceMemory<dfloat>& o_q, dfloat time, dfloat dt, int order);
 
 public:
   mrsaab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
             int Np, int Nfields, int _Npmlfields,
-            dfloat *_lambda, solver_t& solver, mesh_t& _mesh);
-  ~mrsaab3_pml();
+            memory<dfloat> _lambda, platform_t& _platform, mesh_t& _mesh);
 };
 
 } //namespace TimeStepper
 
+} //namespace libp
+
 #endif
diff --git a/include/timer.hpp b/include/timer.hpp
new file mode 100644
index 000000000..cf5a22c2a
--- /dev/null
+++ b/include/timer.hpp
@@ -0,0 +1,59 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef LIBP_TIMER_HPP
+#define LIBP_TIMER_HPP
+
+#include "core.hpp"
+#include "comm.hpp"
+#include "platform.hpp"
+#include <chrono>
+
+namespace libp {
+
+using timePoint_t = std::chrono::time_point<std::chrono::high_resolution_clock>;
+
+/* Host time*/
+timePoint_t Time();
+
+/* Host time after global sync*/
+timePoint_t GlobalTime(comm_t comm);
+
+/* Host time after platform sync*/
+timePoint_t PlatformTime(platform_t &platform);
+
+/* Host time after platform sync and global sync (presumably over the platform's own comm -- confirm)*/
+timePoint_t GlobalPlatformTime(platform_t &platform);
+
+/* Host time after platform sync and global sync over the given comm (inferred from the sibling names -- confirm)*/
+timePoint_t GlobalPlatformTime(platform_t &platform, comm_t comm);
+
+/*Time between time points, in seconds*/
+double ElapsedTime(const timePoint_t start, const timePoint_t end);
+
+} //namespace libp
+
+#endif
diff --git a/include/types.h b/include/types.h
index 45f3bd372..b801ea38a 100644
--- a/include/types.h
+++ b/include/types.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -30,24 +30,18 @@ SOFTWARE.
 // precision of AMG storage
 #if 0
 #define pfloat float
-#define ogs_pfloat ogs_float
 #else
 #define pfloat double
-#define ogs_pfloat ogs_double
 #endif
 
 
 //float data type
 #if 0
 #define dfloat float
-#define ogs_dfloat ogs_float
-#define MPI_DFLOAT MPI_FLOAT
 #define dfloatFormat "%f"
 #define dfloatString "float"
 #else
 #define dfloat double
-#define ogs_dfloat ogs_double
-#define MPI_DFLOAT MPI_DOUBLE
 #define dfloatFormat "%lf"
 #define dfloatString "double"
 #endif
@@ -55,14 +49,10 @@ SOFTWARE.
 //host index data type
 #if 0
 #define hlong int
-#define ogs_hlong ogs_int
-#define MPI_HLONG MPI_INT
 #define hlongFormat "%d"
 #define hlongString "int"
 #else
 #define hlong long long int
-#define ogs_hlong ogs_long_long
-#define MPI_HLONG MPI_LONG_LONG_INT
 #define hlongFormat "%lld"
 #define hlongString "long long int"
 #endif
@@ -70,14 +60,10 @@ SOFTWARE.
 //device index data type
 #if 1
 #define dlong int
-#define ogs_dlong ogs_int
-#define MPI_DLONG MPI_INT
 #define dlongFormat "%d"
 #define dlongString "int"
 #else
 #define dlong long long int
-#define ogs_dlong ogs_longlongint
-#define MPI_DLONG MPI_LONG_LONG_INT
 #define dlongFormat "%lld"
 #define dlongString "long long int"
 #endif
diff --git a/include/utils.hpp b/include/utils.hpp
index 842066bbb..8d0ec5634 100644
--- a/include/utils.hpp
+++ b/include/utils.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,25 @@ SOFTWARE.
 #ifndef UTILS_HPP
 #define UTILS_HPP
 
-#include <occa.hpp>
-#include <mpi.h>
 #include <string>
+#include <cstring>
+#include <ostream>
+#include <iostream>
+#include <cstddef>
+#include <memory>
+#include <algorithm>
+#include <typeinfo>
+#include <cmath>
+#include <occa.hpp>
 #include "types.h"
 
+namespace libp {
+
+using properties_t = occa::json;
+using device_t = occa::device;
+using kernel_t = occa::kernel;
+using stream_t = occa::stream;
+
 //error codes
 #define LIBP_SUCCESS 0
 #define LIBP_ERROR -1
@@ -40,37 +54,62 @@ SOFTWARE.
 #  define __PRETTY_FUNCTION__ __FUNCTION__
 #endif
 
-#define LIBP_ABORT2(filename, function, line, message)              \
-  {                                                                 \
-    std::string banner = "---[ Error ]";                            \
-    std::cerr << '\n'                                               \
-       << std::string(74, '=') << '\n'                              \
-       << banner << std::string(74 - banner.size(), '-') << '\n'    \
-       << "    File     : " << filename << '\n'                     \
-       << "    Line     : " << line     << '\n'                     \
-       << "    Function : " << function << '\n'                     \
-       << "    Message  : " << message  << '\n'                     \
-       << std::string(74, '=') << '\n';                             \
-    MPI_Abort(MPI_COMM_WORLD,LIBP_ERROR);                           \
-  }
-#define LIBP_ABORT(message) LIBP_ABORT2(__FILE__, __PRETTY_FUNCTION__, __LINE__, message)
-
-#define LIBP_WARNING(message)                                       \
-  {                                                                 \
-    std::string banner = "---[ Warning ]";                          \
-    std::cerr << '\n'                                               \
-       << std::string(74, '=') << '\n'                              \
-       << banner << std::string(74 - banner.size(), '-') << '\n'    \
-       << "     " << message  << '\n'                               \
-       << std::string(74, '=') << '\n';                             \
-  }
-
-#define mymax(a,b) (((a)>(b))?(a):(b))
-#define mymin(a,b) (((a)<(b))?(a):(b))
-
-// block size for reduction (hard coded)
-#define BLOCKSIZE 256
-
-
+#define LIBP_TEMPLATE_CHECK(checkFunction, expr, filename, function, line, message) \
+  do {                                                                  \
+    const bool isErr = (bool) (expr);                                   \
+    if (isErr) {                                                        \
+      std::stringstream _check_ss;                                      \
+      _check_ss << message;                                             \
+      checkFunction(filename, function, line, _check_ss.str());         \
+    }                                                                   \
+  } while (false)
+/* NOTE(review): LIBP_TEMPLATE_CHECK uses std::stringstream, but <sstream> is not included directly by this header; presumably it arrives via occa.hpp -- confirm */
+#define LIBP_ABORT3(expr, filename, function, line, message) LIBP_TEMPLATE_CHECK(libp::abort, expr, filename, function, line, message)
+#define LIBP_ABORT2(expr, filename, function, line, message) LIBP_ABORT3(expr, filename, function, line, message)
+#define LIBP_ABORT(message, expr)                            LIBP_ABORT2(expr, __FILE__, __PRETTY_FUNCTION__, __LINE__, message)
+/* The public macros take (message, expr): the action fires only when expr is true, and message may be a chain of << operands */
+#define LIBP_WARNING3(expr, filename, function, line, message) LIBP_TEMPLATE_CHECK(libp::warn, expr, filename, function, line, message)
+#define LIBP_WARNING2(expr, filename, function, line, message) LIBP_WARNING3(expr, filename, function, line, message)
+#define LIBP_WARNING(message, expr)                            LIBP_WARNING2(expr, __FILE__, __PRETTY_FUNCTION__, __LINE__, message)
+/* Unconditional variants: expr is fixed to true */
+#define LIBP_FORCE_ABORT(message)   LIBP_ABORT(message, true)
+#define LIBP_FORCE_WARNING(message) LIBP_WARNING(message, true)
+
+class exception : public std::exception { /*error/warning record that carries its source location*/
+ public:
+  const std::string header;   /*banner label: "Error" from abort, "Warning" from warn*/
+  const std::string filename; /*source file reported in the location block*/
+  const std::string function; /*function name reported in the location block*/
+  const std::string message;  /*free-form text shown after the location block*/
+  const int line;             /*source line reported in the location block*/
+
+  std::string exceptionMessage; /*toString() output cached at construction; backs what()*/
+
+  exception(const std::string &header_,
+            const std::string &filename_,
+            const std::string &function_,
+            const int line_,
+            const std::string &message_);
+  ~exception() throw();
+
+  const char* what() const throw(); /*returns the cached exceptionMessage*/
+  std::string toString() const;     /*full report: banner, location, message, closing rule*/
+  std::string location() const;     /*File / Line / Function lines only*/
+};
+
+std::ostream& operator << (std::ostream& out,
+                           const exception &exc); /*writes exc.toString() and flushes*/
+
+void abort(const std::string &filename,
+           const std::string &function,
+           const int line,
+           const std::string &message); /*throws libp::exception with header "Error"*/
+
+void warn(const std::string &filename,
+          const std::string &function,
+          const int line,
+          const std::string &message); /*prints a "Warning" report to std::cout; does not throw it*/
+
+} //namespace libp
 
 #endif
diff --git a/libs/core/comm.cpp b/libs/core/comm.cpp
new file mode 100644
index 000000000..f96af932e
--- /dev/null
+++ b/libs/core/comm.cpp
@@ -0,0 +1,118 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "comm.hpp"
+
+namespace libp {
+
+namespace Comm {
+
+/*Thin wrappers over MPI_Init and MPI_Finalize (return codes are not checked)*/
+void Init(int &argc, char** &argv) { MPI_Init(&argc, &argv); }
+void Finalize() { MPI_Finalize(); }
+
+/*Handle to MPI_COMM_WORLD with its rank and size cached*/
+comm_t World() {
+  comm_t world;
+  /*default deleter: only the heap copy of the handle is released, MPI_COMM_WORLD itself is never freed*/
+  world.comm_ptr = std::make_shared<MPI_Comm>(MPI_COMM_WORLD);
+  MPI_Comm_rank(world.comm(), &(world._rank));
+  MPI_Comm_size(world.comm(), &(world._size));
+  return world;
+}
+
+void GetProcessorName(char* name, int &namelen) { /*name must have room for MPI_MAX_PROCESSOR_NAME chars; namelen receives the length written*/
+  MPI_Get_processor_name(name,&namelen);
+}
+
+} //namespace Comm
+
+/*Duplicate this communicator; the copy is released with MPI_Comm_free once its last comm_t owner goes away*/
+comm_t comm_t::Dup() const {
+  comm_t c;
+  /*Start from MPI_COMM_NULL so the deleter never inspects an indeterminate handle if MPI_Comm_dup fails to write one*/
+  c.comm_ptr = std::shared_ptr<MPI_Comm>(new MPI_Comm(MPI_COMM_NULL),
+                                        [](MPI_Comm *comm) {
+                                          if (*comm != MPI_COMM_NULL)
+                                            MPI_Comm_free(comm);
+                                          delete comm;
+                                        });
+  MPI_Comm_dup(comm(), c.comm_ptr.get());
+  MPI_Comm_rank(c.comm(), &(c._rank)); /*cache rank/size of the new communicator*/
+  MPI_Comm_size(c.comm(), &(c._size));
+  return c;
+}
+void comm_t::Free() {
+  comm_ptr.reset(); /*drop this handle's reference to the communicator*/
+  _rank = 0;
+  _size = 0;
+}
+/*Split this communicator by color, ordering ranks by key; the result is released with MPI_Comm_free once its last owner goes away*/
+comm_t comm_t::Split(const int color, const int key) const {
+  comm_t c;
+  /*Start from MPI_COMM_NULL so the deleter never inspects an indeterminate handle if MPI_Comm_split fails to write one*/
+  c.comm_ptr = std::shared_ptr<MPI_Comm>(new MPI_Comm(MPI_COMM_NULL),
+                                        [](MPI_Comm *comm) {
+                                          if (*comm != MPI_COMM_NULL)
+                                            MPI_Comm_free(comm);
+                                          delete comm;
+                                        });
+  MPI_Comm_split(comm(), color, key, c.comm_ptr.get());
+  if (c.comm() == MPI_COMM_NULL) { c._rank = 0; c._size = 0; return c; } /*color==MPI_UNDEFINED: no communicator, and rank/size queries on MPI_COMM_NULL are invalid*/
+  MPI_Comm_rank(c.comm(), &(c._rank));
+  MPI_Comm_size(c.comm(), &(c._size));
+  return c;
+}
+
+/*Rank and size getters: return the values cached when the communicator was created (both are 0 after Free())*/
+const int comm_t::rank() const {
+  return _rank;
+}
+const int comm_t::size() const {
+  return _size;
+}
+
+MPI_Comm comm_t::comm() const {
+  /*a comm_t that holds no handle maps to the null communicator*/
+  if (comm_ptr) {
+    return *comm_ptr;
+  }
+  return MPI_COMM_NULL;
+}
+
+void comm_t::Wait(Comm::request_t &request) const { /*blocks until the request completes; the status is discarded*/
+  MPI_Wait(&request, MPI_STATUS_IGNORE);
+}
+
+void comm_t::Waitall(const int count, memory<Comm::request_t> &requests) const { /*waits on the first count entries of requests; statuses are discarded*/
+  MPI_Waitall(count, requests.ptr(), MPI_STATUSES_IGNORE);
+}
+
+void comm_t::Barrier() const { /*MPI_Barrier over this handle's communicator (MPI_COMM_NULL if none is held)*/
+  MPI_Barrier(comm());
+}
+
+} //namespace libp
diff --git a/libs/core/exception.cpp b/libs/core/exception.cpp
new file mode 100644
index 000000000..2be2c7f12
--- /dev/null
+++ b/libs/core/exception.cpp
@@ -0,0 +1,90 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "utils.hpp"
+
+namespace libp {
+
+
+exception::exception(const std::string &header_,
+                      const std::string &filename_,
+                      const std::string &function_,
+                      const int line_,
+                      const std::string &message_) :
+    header(header_),
+    filename(filename_),
+    function(function_),
+    message(message_),
+    line(line_),
+    exceptionMessage(toString()) {} /*must stay last: toString() reads the members above, which the class declares (and so initializes) before exceptionMessage*/
+
+exception::~exception() throw() {}
+
+const char* exception::what() const throw() {
+  return exceptionMessage.c_str(); /*report was formatted once in the constructor; valid for the lifetime of this object*/
+}
+
+std::string exception::toString() const { /*banner padded with '-' to 80 columns, location block, message, then an 80-column '=' rule*/
+  std::stringstream ss;
+  const std::string banner = "---[ " + header + " ]";
+  ss << '\n' /*pad only while the banner is narrower than 80: the unsigned subtraction would otherwise wrap to a huge length and throw*/
+     << banner << std::string(banner.size() < 80 ? 80 - banner.size() : 0, '-') << '\n'
+     << location()
+     << "    Message  : " << message << '\n'
+     << std::string(80, '=') << '\n';
+  return ss.str();
+}
+
+std::string exception::location() const { /*three aligned lines: File, Line, Function*/
+  std::stringstream where;
+  where << "    File     : " << filename << '\n';
+  where << "    Line     : " << line     << '\n';
+  where << "    Function : " << function << '\n';
+  return where.str();
+}
+
+std::ostream& operator << (std::ostream& out,
+                           const exception &exc) {
+  /*emit the formatted report and flush so it shows up immediately*/
+  return out << exc.toString() << std::flush;
+}
+
+void abort(const std::string &filename,
+           const std::string &function,
+           const int line,
+           const std::string &message) {
+  throw exception("Error", filename, function, line, message); /*never returns normally: callers see a libp::exception*/
+}
+
+void warn(const std::string &filename,
+          const std::string &function,
+          const int line,
+          const std::string &message) {
+  /*reuse the exception formatter, but print the report to std::cout instead of throwing it*/
+  std::cout << exception("Warning", filename, function, line, message);
+}
+
+} //namespace libp
diff --git a/libs/core/factor.cpp b/libs/core/factor.cpp
deleted file mode 100644
index d61926f61..000000000
--- a/libs/core/factor.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "core.hpp"
-
-// find a factorization n = nx*ny such that
-//  nx>=ny are 'close' to one another
-void factor2(const int n, int &nx, int &ny) {
-  //start with guessing nx ~= n^1/2
-  nx = round(sqrt(n));
-  ny = 1;
-
-  for (;nx<n;nx++) {
-    if (n % nx ==0) { //if nx divides n
-      ny = n / nx; //divide out nx
-
-      //swap if needed
-      if (ny>nx) std::swap(nx,ny);
-
-      return;
-    }
-  }
-
-  //if we made it this far, n is prime
-  nx = n;
-}
-
-// find a factorization n = nx*ny*nz such that
-//  nx>=ny>=nz are all 'close' to one another
-void factor3(const int n, int &nx, int &ny, int &nz) {
-  //start with guessing nx ~= n^1/3
-  nx = round(std::cbrt(n));
-  ny = nz = 1;
-
-  for (;nx<n;nx++) {
-    if (n % nx ==0) { //if nx divides n
-      const int f = n / nx; //divide out nx
-
-      ny = round(sqrt(f)); //guess ny ~= sqrt(f)
-      for (;ny<f;ny++) {
-        if (f % ny == 0) { //if ny divides f
-          nz = f/ny; //divide out ny
-
-          //sort
-          if (ny>nx) std::swap(nx,ny);
-          if (nz>ny) std::swap(ny,nz);
-          if (ny>nx) std::swap(nx,ny);
-
-          return;
-        }
-      }
-
-      //if we're here, f is prime
-      ny = f;
-      nz = 1;
-
-      //swap if needed
-      if (ny>nx) std::swap(nx,ny);
-
-      return;
-    }
-  }
-
-  //if we made it this far, n is prime
-  nx = n;
-}
\ No newline at end of file
diff --git a/libs/core/matrixEig.cpp b/libs/core/matrixEig.cpp
deleted file mode 100644
index 075c5735b..000000000
--- a/libs/core/matrixEig.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "core.hpp"
-
-extern "C" {
-  void sgeev_(char *JOBVL, char *JOBVR, int *N, float *A, int *LDA, float *WR, float *WI,
-              float *VL, int *LDVL, float *VR, int *LDVR, float *WORK, int *LWORK, int *INFO );
-  void dgeev_(char *JOBVL, char *JOBVR, int *N, double *A, int *LDA, double *WR, double *WI,
-              double *VL, int *LDVL, double *VR, int *LDVR, double *WORK, int *LWORK, int *INFO );
-}
-
-// compute right eigenvectors
-void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI){
-
-  char JOBVL = 'N';
-  char JOBVR = 'V';
-  int LDA = N;
-  int LDVL = N;
-  int LDVR = N;
-  int LWORK = 8*N;
-
-  double *VL = NULL;
-  double *WORK  = (double*) calloc(LWORK,sizeof(double));
-
-  double *tmpA  = (double*) calloc(N*N,sizeof(double));
-  double *tmpVR = (double*) calloc(N*N,sizeof(double));
-
-  for(int n=0;n<N;++n){
-    for(int m=0;m<N;++m){
-      tmpA[n+m*N] = A[n*N+m];
-    }
-  }
-
-  int INFO = -999;
-
-  dgeev_ (&JOBVL, &JOBVR, &N, tmpA, &LDA, WR, WI,
-          VL, &LDVL, tmpVR, &LDVR, WORK, &LWORK, &INFO);
-
-  if(INFO) {
-    std::stringstream ss;
-    ss << "dgeev_ reports info = " << INFO;
-    LIBP_ABORT(ss.str());
-  }
-
-  for(int n=0;n<N;++n){
-    for(int m=0;m<N;++m){
-      VR[n+m*N] = tmpVR[n*N+m];
-    }
-  }
-
-  free(tmpVR);
-  free(tmpA);
-  free(WORK);
-}
-
-// compute right eigenvectors
-void matrixEigenVectors(int N, float *A, float *VR, float *WR, float *WI){
-
-  char JOBVL = 'N';
-  char JOBVR = 'V';
-  int LDA = N;
-  int LDVL = N;
-  int LDVR = N;
-  int LWORK = 8*N;
-
-  float *VL = NULL;
-  float *WORK  = (float*) calloc(LWORK,sizeof(float));
-
-  float *tmpA  = (float*) calloc(N*N,sizeof(float));
-  float *tmpVR = (float*) calloc(N*N,sizeof(float));
-
-  for(int n=0;n<N;++n){
-    for(int m=0;m<N;++m){
-      tmpA[n+m*N] = A[n*N+m];
-    }
-  }
-
-  int INFO = -999;
-
-  sgeev_ (&JOBVL, &JOBVR, &N, tmpA, &LDA, WR, WI,
-          VL, &LDVL, tmpVR, &LDVR, WORK, &LWORK, &INFO);
-
-  if(INFO) {
-    std::stringstream ss;
-    ss << "sgeev_ reports info = " << INFO;
-    LIBP_ABORT(ss.str());
-  }
-
-  for(int n=0;n<N;++n){
-    for(int m=0;m<N;++m){
-      VR[n+m*N] = tmpVR[n*N+m];
-    }
-  }
-
-  free(tmpVR);
-  free(tmpA);
-  free(WORK);
-}
-
-// compute eigenvalues
-void matrixEigenValues(int N, double *A, double *WR, double *WI){
-
-  char JOBVL  = 'N';
-  char JOBVR  = 'N';
-  int LDA = N;
-  int LDVL = N;
-  int LDVR = N;
-  int LWORK = 8*N;
-
-  double *VR = nullptr;
-  double *VL = nullptr;
-  double *WORK  = (double*) calloc(LWORK,sizeof(double));
-
-  int INFO = -999;
-
-  dgeev_ (&JOBVL, &JOBVR, &N, A, &LDA, WR, WI,
-          VL, &LDVL, VR, &LDVR, WORK, &LWORK, &INFO);
-
-  if(INFO) {
-    std::stringstream ss;
-    ss << "dgeev_ reports info = " << INFO;
-    LIBP_ABORT(ss.str());
-  }
-
-  free(WORK);
-}
-
-// compute eigenvalues
-void matrixEigenValues(int N, float *A, float *WR, float *WI){
-
-  char JOBVL  = 'N';
-  char JOBVR  = 'N';
-  int LDA = N;
-  int LDVL = N;
-  int LDVR = N;
-  int LWORK = 8*N;
-
-  float *VR = nullptr;
-  float *VL = nullptr;
-  float *WORK  = (float*) calloc(LWORK,sizeof(float));
-
-  int INFO = -999;
-
-  sgeev_ (&JOBVL, &JOBVR, &N, A, &LDA, WR, WI,
-          VL, &LDVL, VR, &LDVR, WORK, &LWORK, &INFO);
-
-  if(INFO) {
-    std::stringstream ss;
-    ss << "sgeev_ reports info = " << INFO;
-    LIBP_ABORT(ss.str());
-  }
-
-  free(WORK);
-}
\ No newline at end of file
diff --git a/libs/core/matrixRightSolve.cpp b/libs/core/matrixRightSolve.cpp
deleted file mode 100644
index 3a3bb64d4..000000000
--- a/libs/core/matrixRightSolve.cpp
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "core.hpp"
-
-extern "C" {
-  void dgesv_ ( int     *N, int     *NRHS, double  *A,
-                int     *LDA,
-                int     *IPIV,
-                double  *B,
-                int     *LDB,
-                int     *INFO );
-
-  void sgesv_ ( int     *N, int     *NRHS, float  *A,
-                int     *LDA,
-                int     *IPIV,
-                float  *B,
-                int     *LDB,
-                int     *INFO );
-
-  void dgels_ ( char   *TRANS,
-                int    *M,
-                int    *N,
-                int    *NRHS,
-                double *A,
-                int    *LDA,
-                double *B,
-                int    *LDB,
-                double *WORK,
-                int    *LWORK,
-                int    *INFO);
-
-  void dgeqp3_( int    *M,
-                int    *N,
-                double *A,
-                int    *LDA,
-                int    *JPVT,
-                double *TAU,
-                double *WORK,
-                int    *LWORK,
-                int    *INFO);
-
-  void dormqr_( char   *SIDE,
-                char   *TRANS,
-                int    *M,
-                int    *N,
-                int    *K,
-                double *A,
-                int    *LDA,
-                double *TAU,
-                double *C,
-                int    *LDC,
-                double *WORK,
-                int    *LWORK,
-                int    *INFO);
-
-  void dtrsm_ ( char   *SIDE,
-                char   *UPLO,
-                char   *TRANSA,
-                char   *DIAG,
-                int    *M,
-                int    *N,
-                double *ALPHA,
-                double *A,
-                int    *LDA,
-                double *B,
-                int    *LDB);
-}
-
-// C = A/B  = trans(trans(B)\trans(A))
-// assume row major
-void matrixRightSolve(int NrowsA, int NcolsA, double *A, int NrowsB, int NcolsB, double *B, double *C){
-
-  int info;
-
-  int NrowsX = NcolsB;
-  int NcolsX = NrowsB;
-
-  int NrowsY = NcolsA;
-  int NcolsY = NrowsA;
-
-  int lwork = NrowsX*NcolsX;
-
-  // compute inverse mass matrix
-  double *tmpX = (double*) calloc(NrowsX*NcolsX, sizeof(double));
-  double *tmpY = (double*) calloc(NrowsY*NcolsY, sizeof(double));
-
-  int    *ipiv = (int*) calloc(NrowsX, sizeof(int));
-  double *work = (double*) calloc(lwork, sizeof(double));
-
-  for(int n=0;n<NrowsX*NcolsX;++n){
-    tmpX[n] = B[n];
-  }
-
-  for(int n=0;n<NrowsY*NcolsY;++n){
-    tmpY[n] =A[n];
-  }
-
-  dgesv_(&NrowsX, &NcolsY, tmpX, &NrowsX, ipiv, tmpY, &NrowsY, &info); // ?
-
-  if(info) {
-    std::stringstream ss;
-    ss << "dgesv_ reports info = " << info;
-    LIBP_ABORT(ss.str());
-  }
-
-  for(int n=0;n<NrowsY*NcolsY;++n){
-    C[n] = tmpY[n];
-  }
-
-  free(work);
-  free(ipiv);
-  free(tmpX);
-  free(tmpY);
-}
-
-// C = A/B  = trans(trans(B)\trans(A))
-// assume row major
-void matrixRightSolve(int NrowsA, int NcolsA, float *A, int NrowsB, int NcolsB, float *B, float *C){
-
-  int info;
-
-  int NrowsX = NcolsB;
-  int NcolsX = NrowsB;
-
-  int NrowsY = NcolsA;
-  int NcolsY = NrowsA;
-
-  int lwork = NrowsX*NcolsX;
-
-  // compute inverse mass matrix
-  float *tmpX = (float*) calloc(NrowsX*NcolsX, sizeof(float));
-  float *tmpY = (float*) calloc(NrowsY*NcolsY, sizeof(float));
-
-  int    *ipiv = (int*) calloc(NrowsX, sizeof(int));
-  float *work = (float*) calloc(lwork, sizeof(float));
-
-  for(int n=0;n<NrowsX*NcolsX;++n){
-    tmpX[n] = B[n];
-  }
-
-  for(int n=0;n<NrowsY*NcolsY;++n){
-    tmpY[n] =A[n];
-  }
-
-  sgesv_(&NrowsX, &NcolsY, tmpX, &NrowsX, ipiv, tmpY, &NrowsY, &info); // ?
-
-  if(info) {
-    std::stringstream ss;
-    ss << "sgesv_ reports info = " << info;
-    LIBP_ABORT(ss.str());
-  }
-
-  for(int n=0;n<NrowsY*NcolsY;++n){
-    C[n] = tmpY[n];
-  }
-
-  free(work);
-  free(ipiv);
-  free(tmpX);
-  free(tmpY);
-}
-
-// Find minimum-norm solution to xA = b with NrowsA > NcolsA (underdetermined).
-//
-// NB:  A must be stored ROW MAJOR.
-void matrixUnderdeterminedRightSolveMinNorm(int NrowsA, int NcolsA, dfloat *A, dfloat *b, dfloat *x)
-{
-  int     LWORK, INFO = 0;
-  dfloat* WORK;
-
-  dfloat* tmpA = new dfloat[NrowsA*NcolsA]();
-  for (int i = 0; i < NrowsA*NcolsA; i++)
-    tmpA[i] = A[i];
-
-  dfloat* tmpb = new dfloat[NrowsA]();
-  for (int i = 0; i < NcolsA; i++)
-    tmpb[i] = b[i];
-
-  // Solve A^T x^T = b^T.  Note TRANS = 'N', since A is row major.
-  char TRANS = 'N';
-  int  NRHS = 1;
-
-  LWORK = 2*NrowsA*NcolsA;
-  WORK = new dfloat[LWORK]();
-  dgels_(&TRANS, &NcolsA, &NrowsA, &NRHS, tmpA, &NcolsA, tmpb, &NrowsA, WORK, &LWORK, &INFO);
-
-  if (INFO != 0) {
-    std::stringstream ss;
-    ss << "dgels_ returned INFO = " << INFO;
-    LIBP_ABORT(ss.str());
-  }
-
-  // Copy to output.
-  for (int i = 0; i < NrowsA; i++)
-    x[i] = tmpb[i];
-
-  delete[] WORK;
-  delete[] tmpA;
-  delete[] tmpb;
-}
-
-// Solve xA = b with NrowsA > NcolsA (underdetermined) using column-pivoted QR.
-//
-// Done by solving A^T x^T = b^T in 4 steps:
-//   1.  Decompose A^T * P = Q * R.  -->  Q * R * P^T x^T = b^T
-//   2.  Multiply by Q^T.            -->  R * P^T x^T = Q^T b^T
-//   3.  Backsolve with R1.          -->  P^T * x^T = R1^{-1} Q^T b^T
-//       where R1 = leading NcolsA * NcolsA submatrix of R.
-//   4.  Apply permutation.          -->  x^T = P R1^{-1} Q^T b^T
-//
-// NB:  A must be stored ROW MAJOR.
-void matrixUnderdeterminedRightSolveCPQR(int NrowsA, int NcolsA, dfloat *A, dfloat *b, dfloat *x)
-{
-  int     LWORK, INFO = 0;
-  dfloat* WORK;
-
-  dfloat* tmpA = new dfloat[NrowsA*NcolsA]();
-  for (int i = 0; i < NrowsA*NcolsA; i++)
-    tmpA[i] = A[i];
-
-  dfloat* tmpb = new dfloat[NrowsA]();
-  for (int i = 0; i < NcolsA; i++)
-    tmpb[i] = b[i];
-
-  // Compute A^T * P = Q * R.
-  int*    JPVT = new int[NrowsA]();
-  dfloat* TAU = new dfloat[mymin(NrowsA, NcolsA)]();
-
-  LWORK = 3*NrowsA + 1;
-  WORK  = new dfloat[LWORK]();
-  dgeqp3_(&NcolsA, &NrowsA, tmpA, &NcolsA, JPVT, TAU, WORK, &LWORK, &INFO);
-
-  if (INFO != 0) {
-    std::stringstream ss;
-    ss << "dgeqp3_ returned INFO = " << INFO;
-    LIBP_ABORT(ss.str());
-  }
-
-  delete[] WORK;
-
-  // Compute Q^T * b^T.
-  char SIDE = 'L';
-  char TRANS = 'T';
-  int  NRHS = 1;
-  int  NREFLS = NcolsA;
-
-  LWORK = 1;
-  WORK  = new dfloat[LWORK]();
-  dormqr_(&SIDE, &TRANS, &NcolsA, &NRHS, &NREFLS, tmpA, &NcolsA, TAU, tmpb, &NcolsA, WORK, &LWORK, &INFO);
-
-  if (INFO != 0) {
-    std::stringstream ss;
-    ss << "dormqr_ returned INFO = " << INFO;
-    LIBP_ABORT(ss.str());
-  }
-
-  delete[] WORK;
-
-  // Compute R1^{-1} * Q^T * b^T
-  SIDE = 'L';
-  char UPLO = 'U';
-  char TRANSA = 'N';
-  char DIAG = 'N';
-  NRHS = 1;
-  dfloat ALPHA = 1.0;
-
-  dtrsm_(&SIDE, &UPLO, &TRANSA, &DIAG, &NcolsA, &NRHS, &ALPHA, tmpA, &NcolsA, tmpb, &NcolsA);
-
-  // Apply the permutation.
-  for (int i = 0; i < NrowsA; i++)
-    x[JPVT[i] - 1] = tmpb[i];
-
-  delete[] JPVT;
-  delete[] TAU;
-  delete[] tmpA;
-  delete[] tmpb;
-}
diff --git a/libs/mesh/meshOccaSetupQuad2D.cpp b/libs/core/memory.cpp
similarity index 65%
rename from libs/mesh/meshOccaSetupQuad2D.cpp
rename to libs/core/memory.cpp
index fd26e9ef2..69c8ef897 100644
--- a/libs/mesh/meshOccaSetupQuad2D.cpp
+++ b/libs/core/memory.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,21 +24,18 @@ SOFTWARE.
 
 */
 
-#include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-
-void meshQuad2D::OccaSetup(){
-
-  this->mesh2D::OccaSetup();
-
-  o_D = platform.malloc(Nq*Nq*sizeof(dfloat), D);
-
-  o_S    = o_D; //dummy
-  o_MM   = o_D; //dummy
-  o_sM   = o_D; //dummy
-  o_LIFT = o_D; //dummy
-
-  o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*Np*sizeof(dfloat), vgeo);
-  o_sgeo = platform.malloc(Nelements*Nfaces*Nfp*Nsgeo*sizeof(dfloat), sgeo);
-  o_ggeo = platform.malloc(Nelements*Np*Nggeo*sizeof(dfloat), ggeo);
-}
+#include "memory.hpp"
+
+namespace libp {
+/*explicit instantiation of common specializations*/
+template class memory<int>;
+template class memory<long long int>;
+template class memory<float>;
+template class memory<double>;
+
+/*explicit instantiation of common specializations*/
+template class deviceMemory<int>;
+template class deviceMemory<long long int>;
+template class deviceMemory<float>;
+template class deviceMemory<double>;
+} //namespace libp
diff --git a/libs/core/parallelSort.cpp b/libs/core/parallelSort.cpp
deleted file mode 100644
index 76e3b97a9..000000000
--- a/libs/core/parallelSort.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-/* use this for int */
-#include "mesh.hpp"
-
-void mergeLists(size_t sz,
-		int N1, char *v1,
-		int N2, char *v2,
-		char *v3,
-		int (*compare)(const void *, const void *),
-		void (*match)(void *, void *)){
-
-  int n1 = 0, n2 = 0, n3 = 0;
-
-  // merge two lists from v1 and v2
-  for(n3=0;n3<N1+N2;++n3){
-    if(n1<N1 && n2<N2){
-      int c = compare(v1+n1*sz,v2+n2*sz);
-      if(c==-1){
-	memcpy(v3+n3*sz, v1+n1*sz, sz);
-	++n1;
-      }
-      else{
-	memcpy(v3+n3*sz, v2+n2*sz, sz);
-	++n2;
-      }
-    }
-    else if(n1<N1){
-      memcpy(v3+n3*sz, v1+n1*sz, sz);
-      ++n1;
-    }
-    else if(n2<N2){
-      memcpy(v3+n3*sz, v2+n2*sz, sz);
-      ++n2;
-    }
-  }
-
-  // scan for matches
-  for(n3=0;n3<N1+N2-1;++n3){
-    if(!compare(v3+n3*sz,v3+(n3+1)*sz)){
-      match(v3+n3*sz, v3+(n3+1)*sz);
-    }
-  }
-
-  /* copy result back to v1, v2 */
-  memcpy(v1, v3,       N1*sz);
-  memcpy(v2, v3+sz*N1, N2*sz);
-}
-
-// assumes N is even and the same on all ranks
-void parallelSort(int size, int rank, MPI_Comm comm,
-		  int N, void *vv, size_t sz,
-		  int (*compare)(const void *, const void *),
-		  void (*match)(void *, void *)
-		  ){
-
-#if 0
-  int rank, size;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-#endif
-
-  /* cast void * to char * */
-  char *v = (char*) vv;
-
-  /* sort faces by their vertex number pairs */
-  qsort(v, N, sz, compare);
-
-  /* now do progressive merges */
-  int NA=N/2, NB = N/2, NC = N/2;
-
-  MPI_Request recvA, recvC;
-  MPI_Request sendA, sendC;
-  MPI_Status status;
-  int tag = 999;
-
-  /* temporary buffer for incoming data */
-  void *A = (void*) calloc(NA, sz);
-  void *B = v;
-  void *C = v+NB*sz;
-
-  /* temporary space for merge sort */
-  void *tmp = (void*) calloc(N, sz);
-
-  /* max and min elements out of place hop one process at each step */
-  for(int step=0;step<size;++step){
-
-    /* send C, receive A */
-    if(rank<size-1)
-      MPI_Isend(C, NC*sz, MPI_CHAR,  rank+1, tag, comm, &sendC);
-    if(rank>0)
-      MPI_Irecv(A, NA*sz, MPI_CHAR,  rank-1, tag, comm, &recvA);
-
-    if(rank<size-1)
-      MPI_Wait(&sendC, &status);
-    if(rank>0)
-      MPI_Wait(&recvA, &status);
-
-    /* merge sort A & B */
-    if(rank>0)
-      mergeLists(sz, NA, (char*)A, NB, (char*)B, (char*)tmp, compare, match);
-
-    /* send A, receive C */
-    if(rank>0)
-      MPI_Isend(A, NA*sz, MPI_CHAR, rank-1, tag, comm, &sendA);
-    if(rank<size-1)
-      MPI_Irecv(C, NC*sz, MPI_CHAR, rank+1, tag, comm, &recvC);
-
-    if(rank>0)
-      MPI_Wait(&sendA, &status);
-    if(rank<size-1)
-      MPI_Wait(&recvC, &status);
-
-    /* merge sort B & C */
-    mergeLists(sz, NB, (char*)B, NC, (char*)C, (char*)tmp, compare, match);
-
-  }
-
-  free(tmp);
-  free(A);
-}
diff --git a/libs/core/platformBuildKernel.cpp b/libs/core/platformBuildKernel.cpp
index d7d84d3be..fdc5d3df4 100644
--- a/libs/core/platformBuildKernel.cpp
+++ b/libs/core/platformBuildKernel.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,22 +26,29 @@ SOFTWARE.
 
 #include "platform.hpp"
 
-occa::kernel platform_t::buildKernel(std::string fileName, std::string kernelName,
-                                     occa::properties& kernelInfo){
+namespace libp {
 
-  occa::kernel kernel;
+kernel_t platform_t::buildKernel(std::string fileName,
+                                 std::string kernelName,
+                                 properties_t& kernelInfo){
+
+  assertInitialized();
+
+  kernel_t kernel;
 
   //build on root first
-  if (!rank)
+  if (!rank())
     kernel = device.buildKernel(fileName, kernelName, kernelInfo);
 
-  MPI_Barrier(comm);
+  comm.Barrier();
 
   //remaining ranks find the cached version (ideally)
-  if (rank)
+  if (rank())
     kernel = device.buildKernel(fileName, kernelName, kernelInfo);
 
-  MPI_Barrier(comm);
+  comm.Barrier();
 
   return kernel;
 }
+
+} //namespace libp
diff --git a/libs/core/platformDeviceConfig.cpp b/libs/core/platformDeviceConfig.cpp
index 989f017a4..b62d13bc3 100644
--- a/libs/core/platformDeviceConfig.cpp
+++ b/libs/core/platformDeviceConfig.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,30 +25,51 @@ SOFTWARE.
 */
 
 #include "platform.hpp"
-// #include "omp.h"
+#include "omp.h"
+
+namespace libp {
 
 // OCCA build stuff
 void platform_t::DeviceConfig(){
 
+  //find out how many ranks and devices are on this system
+  memory<char> hostnames(size()*MAX_PROCESSOR_NAME);
+  memory<char> hostname = hostnames + rank()*MAX_PROCESSOR_NAME;
+
+  int namelen;
+  Comm::GetProcessorName(hostname.ptr(), namelen);
+  comm.Allgather(hostnames, MAX_PROCESSOR_NAME);
+
+  int localRank = 0;
+  int localSize = 0;
+  for (int n=0; n<rank(); n++){
+    if (!strcmp(hostname.ptr(), hostnames.ptr()+n*MAX_PROCESSOR_NAME)) localRank++;
+  }
+  for (int n=0; n<size(); n++){
+    if (!strcmp(hostname.ptr(), hostnames.ptr()+n*MAX_PROCESSOR_NAME)) localSize++;
+  }
+
   int plat=0;
   int device_id=0;
 
-  if(settings.compareSetting("THREAD MODEL", "OpenCL"))
-    settings.getSetting("PLATFORM NUMBER", plat);
+  settings_t& Settings = settings();
 
-  // read thread model/device/platform from settings
+  if(Settings.compareSetting("THREAD MODEL", "OpenCL"))
+    Settings.getSetting("PLATFORM NUMBER", plat);
+
+  // read thread model/device/platform from Settings
   std::string mode;
 
-  if(settings.compareSetting("THREAD MODEL", "CUDA")){
+  if(Settings.compareSetting("THREAD MODEL", "CUDA")){
     mode = "{mode: 'CUDA'}";
   }
-  else if(settings.compareSetting("THREAD MODEL", "HIP")){
+  else if(Settings.compareSetting("THREAD MODEL", "HIP")){
     mode = "{mode: 'HIP'}";
   }
-  else if(settings.compareSetting("THREAD MODEL", "OpenCL")){
+  else if(Settings.compareSetting("THREAD MODEL", "OpenCL")){
     mode = "{mode: 'OpenCL', platform_id : " + std::to_string(plat) +"}";
   }
-  else if(settings.compareSetting("THREAD MODEL", "OpenMP")){
+  else if(Settings.compareSetting("THREAD MODEL", "OpenMP")){
     mode = "{mode: 'OpenMP'}";
   }
   else{
@@ -56,44 +77,22 @@ void platform_t::DeviceConfig(){
   }
 
   //add a device_id number for some modes
-  if (  settings.compareSetting("THREAD MODEL", "CUDA")
-      ||settings.compareSetting("THREAD MODEL", "HIP")
-      ||settings.compareSetting("THREAD MODEL", "OpenCL")) {
+  if (  Settings.compareSetting("THREAD MODEL", "CUDA")
+      ||Settings.compareSetting("THREAD MODEL", "HIP")
+      ||Settings.compareSetting("THREAD MODEL", "OpenCL")) {
     //for testing a single device, run with 1 rank and specify DEVICE NUMBER
-    if (size==1) {
-      settings.getSetting("DEVICE NUMBER",device_id);
+    if (size()==1) {
+      Settings.getSetting("DEVICE NUMBER",device_id);
     } else {
-      //find out how many ranks and devices are on this system
-      char* hostnames = (char *) ::malloc(size*sizeof(char)*MPI_MAX_PROCESSOR_NAME);
-      char* hostname = hostnames+rank*MPI_MAX_PROCESSOR_NAME;
-
-      int namelen;
-      MPI_Get_processor_name(hostname,&namelen);
-
-      MPI_Allgather(MPI_IN_PLACE , MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
-                    hostnames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, MPI_COMM_WORLD);
-
-      int localRank = 0;
-      int localSize = 0;
-      for (int n=0; n<rank; n++){
-        if (!strcmp(hostname, hostnames+n*MPI_MAX_PROCESSOR_NAME)) localRank++;
-      }
-      for (int n=0; n<size; n++){
-        if (!strcmp(hostname, hostnames+n*MPI_MAX_PROCESSOR_NAME)) localSize++;
-      }
 
       device_id = localRank;
 
       //check for over-subscribing devices
-      int deviceCount = occa::getDeviceCount(mode);
+      int deviceCount = getDeviceCount(mode);
       if (deviceCount>0 && localRank>=deviceCount) {
-        stringstream ss;
-        ss << "Rank " << rank << " oversubscribing device " << device_id%deviceCount << " on node \"" << hostname<< "\"";
-        LIBP_WARNING(ss.str());
+        LIBP_FORCE_WARNING("Rank " << rank() << " oversubscribing device " << device_id%deviceCount << " on node \"" << hostname.ptr() << "\"");
         device_id = device_id%deviceCount;
       }
-      MPI_Barrier(MPI_COMM_WORLD);
-      free(hostnames);
     }
 
     // add device_id to setup string
@@ -101,18 +100,82 @@ void platform_t::DeviceConfig(){
     mode += ", device_id: " + std::to_string(device_id) + "}";
   }
 
-  //set number of omp threads to use
-  //int Ncores = sysconf(_SC_NPROCESSORS_ONLN);
-  //int Nthreads = Ncores/localSize;
-  // Nthreads = mymax(1,Nthreads/2);
-  // omp_set_num_threads(Nthreads);
+#if !defined(LIBP_DEBUG)
+  /*set number of omp threads to use*/
+  /*Use lscpu to determine core and socket counts */
+  FILE *pipeCores   = popen("lscpu | grep \"Core(s) per socket\" | awk '{print $4}'", "r");
+  FILE *pipeSockets = popen("lscpu | grep \"Socket(s)\" | awk '{print $2}'", "r");
+  LIBP_ABORT("popen() failed!",
+             !pipeCores || !pipeSockets);
+
+  std::array<char, 128> buffer;
+  //read to end of line
+  LIBP_ABORT("Error reading core count",
+             !fgets(buffer.data(), buffer.size(), pipeCores));
+  int Ncores = std::stoi(buffer.data());
+
+  //read to end of line
+  LIBP_ABORT("Error reading core count",
+             !fgets(buffer.data(), buffer.size(), pipeSockets));
+  int Nsockets = std::stoi(buffer.data());
+
+  pclose(pipeCores);
+  pclose(pipeSockets);
+
+  // int Ncores = omp_get_num_procs();
+  int NcoresPerNode = Ncores*Nsockets;
+  int Nthreads=0;
+
+  /*Check OMP_NUM_THREADS env variable*/
+  std::string ompNumThreads;
+  char * ompEnvVar = std::getenv("OMP_NUM_THREADS");
+  if (ompEnvVar == nullptr) { // Environment variable is not set
+    Nthreads = std::max(NcoresPerNode/localSize, 1); //Evenly divide number of cores
+
+    // If omp max threads is lower than this (due to binding), go with omp
+    Nthreads = std::min(Nthreads, omp_get_max_threads());
+  } else {
+    ompNumThreads = ompEnvVar;
+    // Environment variable is set, but could be empty string
+    if (ompNumThreads.size() == 0) {
+      // Environment variable is set but equal to empty string
+      Nthreads = std::max(NcoresPerNode/localSize, 1); //Evenly divide number of cores;
+
+      // If omp max threads is lower than this (due to binding), go with omp
+      Nthreads = std::min(Nthreads, omp_get_max_threads());
+    } else {
+      Nthreads = std::stoi(ompNumThreads);
+    }
+  }
+  LIBP_WARNING("Rank " << rank() << " oversubscribing CPU on node \"" << hostname.ptr() << "\"",
+               Nthreads*localSize>NcoresPerNode);
+  omp_set_num_threads(Nthreads);
+  // omp_set_num_threads(1);
 
-  // if (settings.compareSetting("VERBOSE","TRUE"))
-  //   printf("Rank %d: Ncores = %d, Nthreads = %d, device_id = %d \n", rank, Ncores, Nthreads, device_id);
+  // printf("Rank %d: Nsockets = %d, NcoresPerSocket = %d, Nthreads = %d, device_id = %d \n",
+  //        rank(), Nsockets, Ncores, Nthreads, device_id);
+#endif
 
   device.setup(mode);
 
-  std::string occaCacheDir = LIBP_DIR "/.occa";
-  settings.getSetting("CACHE DIR", occaCacheDir);
-  occa::env::setOccaCacheDir(occaCacheDir);
+  std::string cacheDir;
+  char * cacheEnvVar = std::getenv("LIBP_CACHE_DIR");
+  if (cacheEnvVar == nullptr) {
+    // Environment variable is not set
+    cacheDir = LIBP_DIR "/.occa";
+  }
+  else {
+    // Environment variable is set, but could be empty string
+    cacheDir = cacheEnvVar;
+
+    if (cacheDir.size() == 0) {
+      // Environment variable is set but equal to empty string
+      cacheDir = LIBP_DIR "/.occa";
+    }
+  }
+  setCacheDir(cacheDir);
+
+  comm.Barrier();
 }
+
+} //namespace libp
diff --git a/libs/core/platformProperties.cpp b/libs/core/platformProperties.cpp
index 9f603bf9d..451586afa 100644
--- a/libs/core/platformProperties.cpp
+++ b/libs/core/platformProperties.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,79 +26,89 @@ SOFTWARE.
 
 #include "platform.hpp"
 
+namespace libp {
+
 //initialize occa::properties with common props
 void platform_t::DeviceProperties(){
 
-  props["defines"].asObject();
-  props["includes"].asArray();
-  props["header"].asArray();
-  props["flags"].asObject();
+  properties_t& Props = props();
+
+  Props["defines"].asObject();
+  Props["includes"].asArray();
+  Props["header"].asArray();
+  Props["flags"].asObject();
 
-  props["device"].asObject();
-  props["kernel"].asObject();
-  props["memory"].asObject();
+  Props["device"].asObject();
+  Props["kernel"].asObject();
+  Props["memory"].asObject();
 
   if(sizeof(dfloat)==4){
-    props["defines/" "dfloat"]="float";
-    props["defines/" "dfloat2"]="float2";
-    props["defines/" "dfloat4"]="float4";
-    props["defines/" "dfloat8"]="float8";
+    Props["defines/" "dfloat"]="float";
+    Props["defines/" "dfloat2"]="float2";
+    Props["defines/" "dfloat4"]="float4";
+    Props["defines/" "dfloat8"]="float8";
   }
   if(sizeof(dfloat)==8){
-    props["defines/" "dfloat"]="double";
-    props["defines/" "dfloat2"]="double2";
-    props["defines/" "dfloat4"]="double4";
-    props["defines/" "dfloat8"]="double8";
+    Props["defines/" "dfloat"]="double";
+    Props["defines/" "dfloat2"]="double2";
+    Props["defines/" "dfloat4"]="double4";
+    Props["defines/" "dfloat8"]="double8";
   }
 
   if(sizeof(pfloat)==4){
-    props["defines/" "pfloat"]="float";
-    props["defines/" "pfloat2"]="float2";
-    props["defines/" "pfloat4"]="float4";
-    props["defines/" "pfloat8"]="float8";
+    Props["defines/" "pfloat"]="float";
+    Props["defines/" "pfloat2"]="float2";
+    Props["defines/" "pfloat4"]="float4";
+    Props["defines/" "pfloat8"]="float8";
   }
   if(sizeof(pfloat)==8){
-    props["defines/" "pfloat"]="double";
-    props["defines/" "pfloat2"]="double2";
-    props["defines/" "pfloat4"]="double4";
-    props["defines/" "pfloat8"]="double8";
+    Props["defines/" "pfloat"]="double";
+    Props["defines/" "pfloat2"]="double2";
+    Props["defines/" "pfloat4"]="double4";
+    Props["defines/" "pfloat8"]="double8";
   }
 
   
   if(sizeof(dlong)==4){
-    props["defines/" "dlong"]="int";
+    Props["defines/" "dlong"]="int";
   }
   if(sizeof(dlong)==8){
-    props["defines/" "dlong"]="long long int";
+    Props["defines/" "dlong"]="long long int";
   }
 
   if(device.mode()=="Serial") {
-    props["compiler_flags"] += "-O3 ";
-    props["compiler_flags"] += "-g "; //debugging
+    Props["compiler_flags"] += "-O3 ";
+    Props["compiler_flags"] += "-g "; //debugging
+    Props["defines/OCCA_USE_SERIAL"] = 1;
   }
 
   if(device.mode()=="CUDA"){ // add backend compiler optimization for CUDA
-    props["compiler_flags"] += "--ftz=true ";
-    props["compiler_flags"] += "--prec-div=false ";
-    props["compiler_flags"] += "--prec-sqrt=false ";
-    props["compiler_flags"] += "--use_fast_math ";
-    props["compiler_flags"] += "--fmad=true "; // compiler option for cuda
-    props["compiler_flags"] += "-Xptxas -dlcm=ca";
+    Props["compiler_flags"] += "--ftz=true ";
+    Props["compiler_flags"] += "--prec-div=false ";
+    Props["compiler_flags"] += "--prec-sqrt=false ";
+    Props["compiler_flags"] += "--use_fast_math ";
+    Props["compiler_flags"] += "--fmad=true "; // compiler option for cuda
+    Props["compiler_flags"] += "-Xptxas -dlcm=ca";
+    Props["defines/OCCA_USE_CUDA"] = 1;
   }
 
   if(device.mode()=="OpenCL"){ // add backend compiler optimization for OPENCL
-    props["compiler_flags"] += " -cl-std=CL2.0 ";
-    props["compiler_flags"] += " -cl-strict-aliasing ";
-    props["compiler_flags"] += " -cl-mad-enable ";
-    props["compiler_flags"] += " -cl-no-signed-zeros ";
-    props["compiler_flags"] += " -cl-unsafe-math-optimizations ";
-    props["compiler_flags"] += " -cl-fast-relaxed-math ";
+    Props["compiler_flags"] += " -cl-std=CL2.0 ";
+    Props["compiler_flags"] += " -cl-strict-aliasing ";
+    Props["compiler_flags"] += " -cl-mad-enable ";
+    Props["compiler_flags"] += " -cl-no-signed-zeros ";
+    Props["compiler_flags"] += " -cl-unsafe-math-optimizations ";
+    Props["compiler_flags"] += " -cl-fast-relaxed-math ";
+    Props["defines/OCCA_USE_OPENCL"] = 1;
   }
 
   if(device.mode()=="HIP"){ // add backend compiler optimization for HIP
-    props["compiler_flags"] += " -O3 ";
-    props["compiler_flags"] += " -ffp-contract=fast ";
-    // props["compiler_flags"] += " -funsafe-math-optimizations ";
-    // props["compiler_flags"] += " -ffast-math ";
+    Props["compiler_flags"] += " -O3 ";
+    Props["compiler_flags"] += " -ffp-contract=fast ";
+    Props["compiler_flags"] += " -funsafe-math-optimizations ";
+    Props["compiler_flags"] += " -ffast-math ";
+    Props["defines/OCCA_USE_HIP"] = 1;
   }
 }
+
+} //namespace libp
diff --git a/libs/core/platformSettings.cpp b/libs/core/platformSettings.cpp
index ae59f180b..ba506ed61 100644
--- a/libs/core/platformSettings.cpp
+++ b/libs/core/platformSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +26,9 @@ SOFTWARE.
 
 #include "platform.hpp"
 
-platformSettings_t::platformSettings_t(MPI_Comm _comm):
+namespace libp {
+
+platformSettings_t::platformSettings_t(comm_t _comm):
   settings_t(_comm) {
 
   //settings format
@@ -55,10 +57,7 @@ platformSettings_t::platformSettings_t(MPI_Comm _comm):
 
 void platformSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) {
     std::cout << "OCCA Settings:\n\n";
 
     reportSetting("THREAD MODEL");
@@ -66,12 +65,12 @@ void platformSettings_t::report() {
     if (compareSetting("THREAD MODEL","OpenCL"))
       reportSetting("PLATFORM NUMBER");
 
-    int size;
-    MPI_Comm_size(comm, &size);
-    if ((size==1)
+    if ((comm.size()==1)
         &&(compareSetting("THREAD MODEL","CUDA")
         ||compareSetting("THREAD MODEL","HIP")
         ||compareSetting("THREAD MODEL","OpenCL") ))
       reportSetting("DEVICE NUMBER");
   }
-}
\ No newline at end of file
+}
+
+} //namespace libp
diff --git a/libs/core/rankDecomp.cpp b/libs/core/rankDecomp.cpp
new file mode 100644
index 000000000..6353abd21
--- /dev/null
+++ b/libs/core/rankDecomp.cpp
@@ -0,0 +1,215 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "core.hpp"
+
+namespace libp {
+
+// Factor2: split n into two factors, n = nx*ny, returned through nx and ny with
+//  nx>=ny; the search starts near sqrt(n) so the factors are 'close' to one another
+void Factor2(const int n, int &nx, int &ny) {
+  //initial guess nx ~= n^(1/2); ny=1 is the fallback if no divisor is found
+  nx = round(sqrt(n));
+  ny = 1;
+
+  for (;nx<n;nx++) { //scan upward from the guess for the first divisor of n
+    if (n % nx ==0) { //if nx divides n
+      ny = n / nx; //cofactor of nx
+
+      //the guess can round down below sqrt(n), so the cofactor may be the larger one
+      if (ny>nx) std::swap(nx,ny);
+
+      return;
+    }
+  }
+
+  //no divisor found in [guess, n): n is prime (or the loop never ran, e.g. n==1)
+  nx = n;
+}
+
+// Factor3: split n into three factors, n = nx*ny*nz, returned through nx, ny, nz
+//  with nx>=ny>=nz; guesses near cbrt/sqrt keep the factors 'close' to one another
+void Factor3(const int n, int &nx, int &ny, int &nz) {
+  //initial guess nx ~= n^(1/3); ny=nz=1 are the fallbacks if no divisor is found
+  nx = round(std::cbrt(n));
+  ny = nz = 1;
+
+  for (;nx<n;nx++) { //scan upward from the guess for the first divisor of n
+    if (n % nx ==0) { //if nx divides n
+      const int f = n / nx; //remaining product, f = ny*nz
+
+      ny = round(sqrt(f)); //guess ny ~= sqrt(f)
+      for (;ny<f;ny++) { //scan upward for the first divisor of f
+        if (f % ny == 0) { //if ny divides f
+          nz = f/ny; //cofactor of ny in f
+
+          //three compare-and-swaps order the factors so that nx>=ny>=nz
+          if (ny>nx) std::swap(nx,ny);
+          if (nz>ny) std::swap(ny,nz);
+          if (ny>nx) std::swap(nx,ny);
+
+          return;
+        }
+      }
+
+      //no divisor of f found in [guess, f): f is prime, so n = nx*f*1
+      ny = f;
+      nz = 1;
+
+      //order the two non-trivial factors so that nx>=ny
+      if (ny>nx) std::swap(nx,ny);
+
+      return;
+    }
+  }
+
+  //no divisor found in [guess, n): n is prime (or the loop never ran, e.g. n==1)
+  nx = n;
+}
+
+// Return the largest prime factor of n, or -1 if none is found (e.g. n==1)
+static int maxPrimeFactor(int n) {
+  int p = -1;
+
+  // Divide out every factor of 2 (NOTE: n==0 never leaves this loop, since 0%2==0)
+  while (n % 2 == 0) {
+    p = 2;
+    n >>= 1; // equivalent to n /= 2
+  }
+  // n is odd at this point; divide out every factor of 3
+  while (n % 3 == 0) {
+    p = 3;
+    n=n/3;
+  }
+
+  // With 2 and 3 removed, only candidates i and i+2 for i = 5, 11, 17, ...
+  // need testing; stop once i exceeds the square root of the reduced n
+  for (int i = 5; i <= sqrt(n); i += 6) {
+    while (n % i == 0) {
+      p = i;
+      n = n / i;
+    }
+    while (n % (i+2) == 0) {
+      p = i+2;
+      n = n / (i+2);
+    }
+  }
+
+  // Any remainder greater than 4 was not divisible by a smaller candidate,
+  // so it is itself the largest prime factor
+  if (n > 4) p = n;
+
+  return p;
+}
+
+/*Determine the (x,y) coordinates (rank_x,rank_y) of process `rank` in a size_x-by-size_y MPI grid*/
+void RankDecomp2(int  size_x, int  size_y,
+                 int &rank_x, int &rank_y,
+                 const int rank) {
+
+  int size = size_x*size_y; //total number of ranks in this (sub)grid
+
+  if (size==1) { //base case: a 1x1 grid has only the origin
+    rank_x=0;
+    rank_y=0;
+    return;
+  }
+
+  /*Split the longer dimension by its largest prime factor p, then recurse into one piece*/
+  if (size_y>=size_x) { //size_y is largest
+    const int p = maxPrimeFactor(size_y); //number of pieces along y
+    const int csize = size/p; //ranks per piece
+    const int crank = rank%csize; //this rank's index within its piece
+
+    /*Recursive call: coordinates within the piece*/
+    int crank_y=-1;
+    RankDecomp2(size_x, size_y/p,
+                rank_x, crank_y, crank);
+    rank_y = crank_y + (rank/csize)*(size_y/p); //shift by the piece's offset in y
+  } else { //size_x is largest
+    const int p = maxPrimeFactor(size_x); //number of pieces along x
+    const int csize = size/p; //ranks per piece
+    const int crank = rank%csize; //this rank's index within its piece
+
+    /*Recursive call: coordinates within the piece*/
+    int crank_x=-1;
+    RankDecomp2(size_x/p, size_y,
+                crank_x, rank_y, crank);
+    rank_x = crank_x + (rank/csize)*(size_x/p); //shift by the piece's offset in x
+  }
+}
+
+/*Determine the (x,y,z) coordinates (rank_x,rank_y,rank_z) of process `rank` in a size_x-by-size_y-by-size_z MPI grid*/
+void RankDecomp3(int  size_x, int  size_y, int  size_z,
+                 int &rank_x, int &rank_y, int &rank_z,
+                 const int rank) {
+
+  int size = size_x*size_y*size_z; //total number of ranks in this (sub)grid
+
+  if (size==1) { //base case: a 1x1x1 grid has only the origin
+    rank_x=0;
+    rank_y=0;
+    rank_z=0;
+    return;
+  }
+
+  /*Split the longest dimension by its largest prime factor p, then recurse into one piece*/
+  if (size_z>=size_x && size_z>=size_y) { //size_z is largest
+
+    const int p = maxPrimeFactor(size_z); //number of pieces along z
+    const int csize = size/p; //ranks per piece
+    const int crank = rank%csize; //this rank's index within its piece
+
+    /*Recursive call: coordinates within the piece*/
+    int crank_z=-1;
+    RankDecomp3(size_x, size_y, size_z/p,
+                rank_x, rank_y, crank_z, crank);
+    rank_z = crank_z + (rank/csize)*(size_z/p); //shift by the piece's offset in z
+
+  } else if (size_y>=size_x && size_y>=size_z) { //size_y is largest
+    const int p = maxPrimeFactor(size_y); //number of pieces along y
+    const int csize = size/p; //ranks per piece
+    const int crank = rank%csize; //this rank's index within its piece
+
+    /*Recursive call: coordinates within the piece*/
+    int crank_y=-1;
+    RankDecomp3(size_x, size_y/p, size_z,
+                rank_x, crank_y, rank_z, crank);
+    rank_y = crank_y + (rank/csize)*(size_y/p); //shift by the piece's offset in y
+  } else { //size_x is largest
+    const int p = maxPrimeFactor(size_x); //number of pieces along x
+    const int csize = size/p; //ranks per piece
+    const int crank = rank%csize; //this rank's index within its piece
+
+    /*Recursive call: coordinates within the piece*/
+    int crank_x=-1;
+    RankDecomp3(size_x/p, size_y, size_z,
+                crank_x, rank_y, rank_z, crank);
+    rank_x = crank_x + (rank/csize)*(size_x/p); //shift by the piece's offset in x
+  }
+}
+
+} //namespace libp
diff --git a/libs/core/settings.cpp b/libs/core/settings.cpp
index 321bb8d82..202de2856 100644
--- a/libs/core/settings.cpp
+++ b/libs/core/settings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,8 +26,17 @@ SOFTWARE.
 
 #include "settings.hpp"
 
-setting_t::setting_t(string name_, string val_, string description_, vector<string> options_)
-  : name{name_}, val{val_}, description{description_}, options{options_} {}
+namespace libp {
+
+using std::vector;
+using std::string;
+
+setting_t::setting_t(string name_, string val_,
+                     string description_, vector<string> options_):
+  name{name_},
+  val{val_},
+  description{description_},
+  options{options_} {}
 
 const string& setting_t::getName() const {
   return name;
@@ -57,7 +66,7 @@ void setting_t::updateVal(const string newVal){
        << "Possible values are: { ";
     for (size_t i=0;i<options.size()-1;i++) ss << options[i] << ", ";
     ss << options[options.size()-1] << " }" << std::endl;
-    LIBP_ABORT(ss.str());
+    LIBP_FORCE_ABORT(ss.str());
   }
 }
 
@@ -83,12 +92,12 @@ string setting_t::toString() const {
   return ss.str();
 }
 
-std::ostream& operator<<(ostream& os, const setting_t& setting) {
+std::ostream& operator<<(std::ostream& os, const setting_t& setting) {
   os << setting.toString();
   return os;
 }
 
-settings_t::settings_t(MPI_Comm _comm):
+settings_t::settings_t(comm_t _comm):
   comm(_comm) {}
 
 void settings_t::newSetting(const string name, const string val,
@@ -96,13 +105,10 @@ void settings_t::newSetting(const string name, const string val,
                             const vector<string> options) {
   auto search = settings.find(name);
   if (search == settings.end()) {
-    setting_t *S = new setting_t(name, val, description, options);
-    settings[name] = S;
+    settings[name] = setting_t(name, val, description, options);
     insertOrder.push_back(name);
   } else {
-    stringstream ss;
-    ss << "Setting with name: [" << name << "] already exists.";
-    LIBP_ABORT(ss.str());
+    LIBP_FORCE_ABORT("Setting with name: [" << name << "] already exists.");
   }
 }
 
@@ -117,12 +123,10 @@ bool settings_t::hasSetting(const string name) {
 void settings_t::changeSetting(const string name, const string newVal) {
   auto search = settings.find(name);
   if (search != settings.end()) {
-    setting_t* val = search->second;
-    val->updateVal(newVal);
+    setting_t& val = search->second;
+    val.updateVal(newVal);
   } else {
-    stringstream ss;
-    ss << "Setting with name: [" << name << "] does not exist.";
-    LIBP_ABORT(ss.str());
+    LIBP_FORCE_ABORT("Setting with name: [" << name << "] does not exist.");
   }
 }
 
@@ -131,17 +135,13 @@ void settings_t::readSettingsFromFile(string filename) {
   string line;
   std::ifstream file;
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
+  int rank = comm.rank();
 
   //only the root rank performs the read
   if (!rank) {
     file.open(filename);
-    if (!file.is_open()) {
-      stringstream ss;
-      ss << "Failed to open: " << filename.c_str();
-      LIBP_ABORT(ss.str());
-    }
+    LIBP_ABORT("Failed to open: " << filename.c_str(),
+               !file.is_open());
   }
 
   string name = "";
@@ -151,23 +151,26 @@ void settings_t::readSettingsFromFile(string filename) {
   int flag;
 
   if (!rank)
-   flag = (getline(file,line)) ? 1 : 0;
+    flag = (getline(file,line)) ? 1 : 0;
+
+  comm.Bcast(flag, 0);
 
-  MPI_Bcast(&flag, 1, MPI_INT, 0, comm);
+  int MaxLineSize=512;
+  memory<char> cline;
+  cline.calloc(MaxLineSize+1);
 
   while (flag) {
     int size;
-    char *cline;
 
     if (!rank) {
       size = line.length();
+      LIBP_ABORT("Line in settings file is too long: " << line,
+                 size>MaxLineSize);
     }
-    MPI_Bcast(&size, 1, MPI_INT, 0, comm);
+    comm.Bcast(size, 0);
 
-    cline = (char*) calloc(size+1,sizeof(char));
-    if (!rank) strcpy(cline, line.c_str());
-
-    MPI_Bcast(cline, size, MPI_CHAR, 0, comm);
+    if (!rank) strcpy(cline.ptr(), line.c_str());
+    comm.Bcast(cline, 0, size);
 
     for(int i=0; i<size; i++){
       char c = cline[i];
@@ -194,7 +197,6 @@ void settings_t::readSettingsFromFile(string filename) {
         val += c;
       }
     }
-    if (cline) free(cline);
 
     if (name.length() && val.length()) {
       newSetting(name, val);
@@ -204,7 +206,7 @@ void settings_t::readSettingsFromFile(string filename) {
     if (!rank)
       flag = (getline(file,line)) ? 1 : 0;
 
-    MPI_Bcast(&flag, 1, MPI_INT, 0, comm);
+    comm.Bcast(flag, 0);
   }
 
   if (!rank)
@@ -214,12 +216,10 @@ void settings_t::readSettingsFromFile(string filename) {
 string settings_t::getSetting(const string name) const {
   auto search = settings.find(name);
   if (search != settings.end()) {
-    setting_t* val = search->second;
-    return val->getVal<string>();
+    const setting_t& val = search->second;
+    return val.getVal<string>();
   } else {
-    stringstream ss;
-    ss << "Unable to find setting: [" << name << "]";
-    LIBP_ABORT(ss.str());
+    LIBP_FORCE_ABORT("Unable to find setting: [" << name << "]");
     return string();
   }
 }
@@ -227,12 +227,10 @@ string settings_t::getSetting(const string name) const {
 bool settings_t::compareSetting(const string name, const string token) const {
   auto search = settings.find(name);
   if (search != settings.end()) {
-    setting_t* val = search->second;
-    return val->compareVal(token);
+    const setting_t& val = search->second;
+    return val.compareVal(token);
   } else {
-    stringstream ss;
-    ss << "Unable to find setting: [" << name.c_str() << "]";
-    LIBP_ABORT(ss.str());
+    LIBP_FORCE_ABORT("Unable to find setting: [" << name.c_str() << "]");
     return false;
   }
 }
@@ -241,24 +239,19 @@ void settings_t::report() {
   std::cout << "Settings:\n\n";
   for (size_t i = 0; i < insertOrder.size(); ++i) {
     const string &s = insertOrder[i];
-    setting_t* val = settings[s];
-    std::cout << *val << std::endl;
+    const setting_t& val = settings[s];
+    std::cout << val << std::endl;
   }
 }
 
 void settings_t::reportSetting(const string name) const {
   auto search = settings.find(name);
   if (search != settings.end()) {
-    setting_t* val = search->second;
-    std::cout << *val << std::endl;
+    const setting_t& val = search->second;
+    std::cout << val << std::endl;
   } else {
-    stringstream ss;
-    ss << "Unable to find setting: [" << name.c_str() << "]";
-    LIBP_ABORT(ss.str());
+    LIBP_FORCE_ABORT("Unable to find setting: [" << name.c_str() << "]");
   }
 }
 
-settings_t::~settings_t() {
-  for(auto it = settings.begin(); it != settings.end(); ++it)
-    delete it->second;
-}
+} //namespace libp
diff --git a/solvers/ins/src/insBoundarySetup.cpp b/libs/core/timer.cpp
similarity index 52%
rename from solvers/ins/src/insBoundarySetup.cpp
rename to libs/core/timer.cpp
index fb8a1e83c..382e5a479 100644
--- a/solvers/ins/src/insBoundarySetup.cpp
+++ b/libs/core/timer.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,32 +24,44 @@ SOFTWARE.
 
 */
 
-#include "ins.hpp"
-
-void ins_t::BoundarySetup(){
-
-  //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann)
-  mapB = (int *) calloc(mesh.Nelements*mesh.Np,sizeof(int));
-  const int largeNumber = 1<<20;
-  for (dlong e=0;e<mesh.Nelements;e++) {
-    for (int n=0;n<mesh.Np;n++) mapB[n+e*mesh.Np] = largeNumber;
-    for (int f=0;f<mesh.Nfaces;f++) {
-      int bc = mesh.EToB[f+e*mesh.Nfaces];
-      if (bc>0) {
-        for (int n=0;n<mesh.Nfp;n++) {
-          int fid = mesh.faceNodes[n+f*mesh.Nfp];
-          mapB[fid+e*mesh.Np] = mymin(bc,mapB[fid+e*mesh.Np]);
-        }
-      }
-    }
-  }
-  mesh.ogs->GatherScatter(mapB, ogs_int, ogs_min, ogs_sym);
-
-  for (dlong n=0;n<mesh.Nelements*mesh.Np;n++) {
-    if (mapB[n] == largeNumber) {//no boundary
-      mapB[n] = 0.;
-    }
-  }
-
-  o_mapB = platform.malloc(mesh.Nelements*mesh.Np*sizeof(int), mapB);
+#include "timer.hpp"
+
+namespace libp {
+
+/* Host time*/
+timePoint_t Time() {
+  return std::chrono::high_resolution_clock::now();
+}
+
+/* Host time after global sync*/
+timePoint_t GlobalTime(comm_t comm) {
+  comm.Barrier();
+  return Time();
 }
+
+/* Host time after platform sync*/
+timePoint_t PlatformTime(platform_t &platform) {
+  platform.finish();
+  return Time();
+}
+
+/* Host time after platform sync and global sync on the platform's comm*/
+timePoint_t GlobalPlatformTime(platform_t &platform) {
+  platform.finish();
+  platform.comm.Barrier();
+  return Time();
+}
+
+/* Host time after platform sync and global sync on the given comm*/
+timePoint_t GlobalPlatformTime(platform_t &platform, comm_t comm) {
+  platform.finish();
+  comm.Barrier();
+  return Time();
+}
+
+/*Time between time points, in seconds*/
+double ElapsedTime(const timePoint_t start, const timePoint_t end) {
+  return std::chrono::duration_cast<std::chrono::microseconds>(end-start).count()/(1.0e6);
+}
+
+} //namespace libp
diff --git a/libs/linAlg/linAlg.cpp b/libs/linAlg/linAlg.cpp
index 6776e8a73..0c0624bf4 100644
--- a/libs/linAlg/linAlg.cpp
+++ b/libs/linAlg/linAlg.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,225 +25,202 @@ SOFTWARE.
 */
 
 #include "linAlg.hpp"
+#include "platform.hpp"
+
+namespace libp {
 
 /*********************/
 /* vector operations */
 /*********************/
 
 // o_a[n] = alpha
-void linAlg_t::set(const dlong N, const dfloat alpha, occa::memory& o_a) {
+void linAlg_t::set(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_a) {
   setKernel(N, alpha, o_a);
 }
 
 // o_a[n] += alpha
-void linAlg_t::add(const dlong N, const dfloat alpha, occa::memory& o_a) {
+void linAlg_t::add(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_a) {
   addKernel(N, alpha, o_a);
 }
 
 // o_a[n] *= alpha
-void linAlg_t::scale(const dlong N, const dfloat alpha, occa::memory& o_a)  {
+void linAlg_t::scale(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_a)  {
   scaleKernel(N, alpha, o_a);
 }
 
 // o_y[n] = beta*o_y[n] + alpha*o_x[n]
-void linAlg_t::axpy(const dlong N, const dfloat alpha, occa::memory& o_x,
-                    const dfloat beta,  occa::memory& o_y) {
+void linAlg_t::axpy(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_x,
+                    const dfloat beta,  deviceMemory<dfloat> o_y) {
   axpyKernel(N, alpha, o_x, beta, o_y);
 }
 
 // o_z[n] = beta*o_y[n] + alpha*o_x[n]
-void linAlg_t::zaxpy(const dlong N, const dfloat alpha, occa::memory& o_x,
-                     const dfloat beta, occa::memory& o_y, occa::memory& o_z) {
+void linAlg_t::zaxpy(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_x,
+                     const dfloat beta, deviceMemory<dfloat> o_y, deviceMemory<dfloat> o_z) {
   zaxpyKernel(N, alpha, o_x, beta, o_y, o_z);
 }
 
 // o_x[n] = alpha*o_a[n]*o_x[n]
 void linAlg_t::amx(const dlong N, const dfloat alpha,
-                   occa::memory& o_a, occa::memory& o_x) {
+                   deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x) {
   amxKernel(N, alpha, o_a, o_x);
 }
 
 // o_y[n] = alpha*o_a[n]*o_x[n] + beta*o_y[n]
 void linAlg_t::amxpy(const dlong N, const dfloat alpha,
-                     occa::memory& o_a, occa::memory& o_x,
-                     const dfloat beta, occa::memory& o_y) {
+                     deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x,
+                     const dfloat beta, deviceMemory<dfloat> o_y) {
   amxpyKernel(N, alpha, o_a, o_x, beta, o_y);
 }
 
 // o_z[n] = alpha*o_a[n]*o_x[n] + beta*o_y[n]
 void linAlg_t::zamxpy(const dlong N, const dfloat alpha,
-                      occa::memory& o_a, occa::memory& o_x,
-                      const dfloat beta, occa::memory& o_y, occa::memory& o_z) {
+                      deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x,
+                      const dfloat beta, deviceMemory<dfloat> o_y, deviceMemory<dfloat> o_z) {
   zamxpyKernel(N, alpha, o_a, o_x, beta, o_y, o_z);
 }
 
 // o_x[n] = alpha*o_x[n]/o_a[n]
 void linAlg_t::adx(const dlong N, const dfloat alpha,
-                   occa::memory& o_a, occa::memory& o_x) {
+                   deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x) {
   adxKernel(N, alpha, o_a, o_x);
 }
 
 // o_y[n] = alpha*o_x[n]/o_a[n] + beta*o_y[n]
 void linAlg_t::adxpy(const dlong N, const dfloat alpha,
-                     occa::memory& o_a, occa::memory& o_x,
-                     const dfloat beta, occa::memory& o_y) {
+                     deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x,
+                     const dfloat beta, deviceMemory<dfloat> o_y) {
   adxpyKernel(N, alpha, o_a, o_x, beta, o_y);
 }
 
 // o_z[n] = alpha*o_x[n]/o_a[n] + beta*o_y[n]
 void linAlg_t::zadxpy(const dlong N, const dfloat alpha,
-                      occa::memory& o_a, occa::memory& o_x,
-                      const dfloat beta, occa::memory& o_y, occa::memory& o_z) {
+                      deviceMemory<dfloat> o_a, deviceMemory<dfloat> o_x,
+                      const dfloat beta, deviceMemory<dfloat> o_y, deviceMemory<dfloat> o_z) {
   zadxpyKernel(N, alpha, o_a, o_x, beta, o_y, o_z);
 }
 
 // \min o_a
-dfloat linAlg_t::min(const dlong N, occa::memory& o_a, MPI_Comm comm) {
-  //TODO, maybe complete reduction on device with second kernel?
+dfloat linAlg_t::min(const dlong N, deviceMemory<dfloat> o_a, comm_t comm) {
   int Nblock = (N+blocksize-1)/blocksize;
   Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
 
-  minKernel(Nblock, N, o_a, o_scratch);
-
-  o_scratch.copyTo(scratch, Nblock*sizeof(dfloat));
+  minKernel1(Nblock, N, o_a, o_scratch);
+  minKernel2(Nblock, o_scratch);
 
-  dfloat min = std::numeric_limits<dfloat>::max();
-  for(dlong n=0;n<Nblock;++n){
-    min = (scratch[n]<min) ? scratch[n] : min;
-  }
+  h_scratch.copyFrom(o_scratch, 1, 0, properties_t("async", true));
+  platform->finish();
 
-  dfloat globalmin = 0;
-  MPI_Allreduce(&min, &globalmin, 1, MPI_DFLOAT, MPI_MIN, comm);
+  dfloat globalmin = h_scratch[0];
+  comm.Allreduce(globalmin, Comm::Min);
 
   return globalmin;
 }
 
 // \max o_a
-dfloat linAlg_t::max(const dlong N, occa::memory& o_a, MPI_Comm comm) {
-  //TODO, maybe complete reduction on device with second kernel?
+dfloat linAlg_t::max(const dlong N, deviceMemory<dfloat> o_a, comm_t comm) {
   int Nblock = (N+blocksize-1)/blocksize;
   Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
 
-  maxKernel(Nblock, N, o_a, o_scratch);
-
-  o_scratch.copyTo(scratch, Nblock*sizeof(dfloat));
+  maxKernel1(Nblock, N, o_a, o_scratch);
+  maxKernel2(Nblock, o_scratch);
 
-  dfloat max = -std::numeric_limits<dfloat>::max();
-  for(dlong n=0;n<Nblock;++n){
-    max = (scratch[n]>max) ? scratch[n] : max;
-  }
+  h_scratch.copyFrom(o_scratch, 1, 0, properties_t("async", true));
+  platform->finish();
 
-  dfloat globalmax = 0;
-  MPI_Allreduce(&max, &globalmax, 1, MPI_DFLOAT, MPI_MAX, comm);
+  dfloat globalmax = h_scratch[0];
+  comm.Allreduce(globalmax, Comm::Max);
 
   return globalmax;
 }
 
 // \sum o_a
-dfloat linAlg_t::sum(const dlong N, occa::memory& o_a, MPI_Comm comm) {
-  //TODO, maybe complete reduction on device with second kernel?
+dfloat linAlg_t::sum(const dlong N, deviceMemory<dfloat> o_a, comm_t comm) {
   int Nblock = (N+blocksize-1)/blocksize;
   Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
 
-  sumKernel(Nblock, N, o_a, o_scratch);
+  sumKernel1(Nblock, N, o_a, o_scratch);
+  sumKernel2(Nblock, o_scratch);
 
-  o_scratch.copyTo(scratch, Nblock*sizeof(dfloat));
+  h_scratch.copyFrom(o_scratch, 1, 0, properties_t("async", true));
+  platform->finish();
 
-  dfloat sum = 0;
-  for(dlong n=0;n<Nblock;++n){
-    sum += scratch[n];
-  }
-
-  dfloat globalsum = 0;
-  MPI_Allreduce(&sum, &globalsum, 1, MPI_DFLOAT, MPI_SUM, comm);
+  dfloat globalsum = h_scratch[0];
+  comm.Allreduce(globalsum, Comm::Sum);
 
   return globalsum;
 }
 
 // ||o_a||_2
-dfloat linAlg_t::norm2(const dlong N, occa::memory& o_a, MPI_Comm comm) {
-  //TODO, maybe complete reduction on device with second kernel?
+dfloat linAlg_t::norm2(const dlong N, deviceMemory<dfloat> o_a, comm_t comm) {
   int Nblock = (N+blocksize-1)/blocksize;
   Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
 
-  norm2Kernel(Nblock, N, o_a, o_scratch);
-
-  o_scratch.copyTo(scratch, Nblock*sizeof(dfloat));
+  norm2Kernel1(Nblock, N, o_a, o_scratch);
+  norm2Kernel2(Nblock, o_scratch);
 
-  dfloat norm = 0;
-  for(dlong n=0;n<Nblock;++n){
-    norm += scratch[n];
-  }
+  h_scratch.copyFrom(o_scratch, 1, 0, properties_t("async", true));
+  platform->finish();
 
-  dfloat globalnorm = 0;
-  MPI_Allreduce(&norm, &globalnorm, 1, MPI_DFLOAT, MPI_SUM, comm);
+  dfloat globalnorm = h_scratch[0];
+  comm.Allreduce(globalnorm, Comm::Sum);
 
   return sqrt(globalnorm);
 }
 
 // o_x.o_y
-dfloat linAlg_t::innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y,
-                           MPI_Comm comm) {
-  //TODO, maybe complete reduction on device with second kernel?
+dfloat linAlg_t::innerProd(const dlong N, deviceMemory<dfloat> o_x, deviceMemory<dfloat> o_y,
+                           comm_t comm) {
   int Nblock = (N+blocksize-1)/blocksize;
   Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
 
-  innerProdKernel(Nblock, N, o_x, o_y, o_scratch);
+  innerProdKernel1(Nblock, N, o_x, o_y, o_scratch);
+  innerProdKernel2(Nblock, o_scratch);
 
-  o_scratch.copyTo(scratch, Nblock*sizeof(dfloat));
+  h_scratch.copyFrom(o_scratch, 1, 0, properties_t("async", true));
+  platform->finish();
 
-  dfloat dot = 0;
-  for(dlong n=0;n<Nblock;++n){
-    dot += scratch[n];
-  }
-
-  dfloat globaldot = 0;
-  MPI_Allreduce(&dot, &globaldot, 1, MPI_DFLOAT, MPI_SUM, comm);
+  dfloat globaldot = h_scratch[0];
+  comm.Allreduce(globaldot, Comm::Sum);
 
   return globaldot;
 }
 
 // o_w.o_x.o_y
-dfloat linAlg_t::weightedInnerProd(const dlong N, occa::memory& o_w,
-                                   occa::memory& o_x, occa::memory& o_y,
-                                   MPI_Comm comm) {
-  //TODO, maybe complete reduction on device with second kernel?
+dfloat linAlg_t::weightedInnerProd(const dlong N, deviceMemory<dfloat> o_w,
+                                   deviceMemory<dfloat> o_x, deviceMemory<dfloat> o_y,
+                                   comm_t comm) {
   int Nblock = (N+blocksize-1)/blocksize;
   Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
 
-  weightedInnerProdKernel(Nblock, N, o_w, o_x, o_y, o_scratch);
-
-  o_scratch.copyTo(scratch, Nblock*sizeof(dfloat));
+  weightedInnerProdKernel1(Nblock, N, o_w, o_x, o_y, o_scratch);
+  weightedInnerProdKernel2(Nblock, o_scratch);
 
-  dfloat dot = 0;
-  for(dlong n=0;n<Nblock;++n){
-    dot += scratch[n];
-  }
+  h_scratch.copyFrom(o_scratch, 1, 0, properties_t("async", true));
+  platform->finish();
 
-  dfloat globaldot = 0;
-  MPI_Allreduce(&dot, &globaldot, 1, MPI_DFLOAT, MPI_SUM, comm);
+  dfloat globaldot = h_scratch[0];
+  comm.Allreduce(globaldot, Comm::Sum);
 
   return globaldot;
 }
 
 // ||o_a||_w2
-dfloat linAlg_t::weightedNorm2(const dlong N, occa::memory& o_w,
-                               occa::memory& o_a, MPI_Comm comm) {
-  //TODO, maybe complete reduction on device with second kernel?
+dfloat linAlg_t::weightedNorm2(const dlong N, deviceMemory<dfloat> o_w,
+                               deviceMemory<dfloat> o_a, comm_t comm) {
   int Nblock = (N+blocksize-1)/blocksize;
   Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
 
-  weightedNorm2Kernel(Nblock, N, o_w, o_a, o_scratch);
+  weightedNorm2Kernel1(Nblock, N, o_w, o_a, o_scratch);
+  weightedNorm2Kernel2(Nblock, o_scratch);
 
-  o_scratch.copyTo(scratch, Nblock*sizeof(dfloat));
+  h_scratch.copyFrom(o_scratch, 1, 0, properties_t("async", true));
+  platform->finish();
 
-  dfloat norm = 0;
-  for(dlong n=0;n<Nblock;++n){
-    norm += scratch[n];
-  }
-
-  dfloat globalnorm = 0;
-  MPI_Allreduce(&norm, &globalnorm, 1, MPI_DFLOAT, MPI_SUM, comm);
+  dfloat globalnorm = h_scratch[0];
+  comm.Allreduce(globalnorm, Comm::Sum);
 
   return sqrt(globalnorm);
 }
+
+} //namespace libp
diff --git a/libs/core/matrixConditionNumber.cpp b/libs/linAlg/linAlgMatrixConditionNumber.cpp
similarity index 52%
rename from libs/core/matrixConditionNumber.cpp
rename to libs/linAlg/linAlgMatrixConditionNumber.cpp
index 3a0472c61..87f2be888 100644
--- a/libs/core/matrixConditionNumber.cpp
+++ b/libs/linAlg/linAlgMatrixConditionNumber.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,7 @@ SOFTWARE.
 
 */
 
-#include "core.hpp"
+#include "linAlg.hpp"
 
 extern "C" {
   void dgetrf_(int* M, int *N, double* A, int* lda, int* IPIV, int* INFO);
@@ -39,8 +39,11 @@ extern "C" {
                 float *RCOND, float *WORK, int *IWORK, int *INFO );
 }
 
-double matrixConditionNumber(int N, double *A) {
+namespace libp {
 
+double linAlg_t::matrixConditionNumber(const int N, const memory<double> A) {
+
+  int n = N;
   int lwork = 4*N;
   int info;
 
@@ -49,47 +52,35 @@ double matrixConditionNumber(int N, double *A) {
   double Acond;
   double Anorm;
 
-  double *tmpLU = (double*) calloc(N*N, sizeof(double));
+  memory<double> tmpLU(N*N);
 
-  int *ipiv = (int*) calloc(N, sizeof(int));
-  double *work = (double*) calloc(lwork, sizeof(double));
-  int  *iwork = (int*) calloc(N, sizeof(int));
+  memory<int> ipiv(N);
+  memory<double> work(lwork);
+  memory<int> iwork(N);
 
-  for(int n=0;n<N*N;++n){
-    tmpLU[n] = (double) A[n];
-  }
+  tmpLU.copyFrom(A, N*N);
 
   //get the matrix norm of A
-  Anorm = dlange_(&norm, &N, &N, tmpLU, &N, work);
+  Anorm = dlange_(&norm, &n, &n, tmpLU.ptr(), &n, work.ptr());
 
   //compute LU factorization
-  dgetrf_ (&N, &N, tmpLU, &N, ipiv, &info);
+  dgetrf_ (&n, &n, tmpLU.ptr(), &n, ipiv.ptr(), &info);
 
-  if(info) {
-    std::stringstream ss;
-    ss << "dgetrf reports info = " << info << " when computing condition number";
-    LIBP_ABORT(ss.str());
-  }
+  LIBP_ABORT("dgetrf reports info = " << info << " when computing condition number",
+             info);
 
   //compute inverse condition number
-  dgecon_(&norm, &N, tmpLU, &N, &Anorm, &Acond, work, iwork, &info);
-
-  if(info) {
-    std::stringstream ss;
-    ss << "dgecon reports info = " << info << " when computing condition number";
-    LIBP_ABORT(ss.str());
-  }
+  dgecon_(&norm, &n, tmpLU.ptr(), &n, &Anorm, &Acond, work.ptr(), iwork.ptr(), &info);
 
-  free(work);
-  free(iwork);
-  free(ipiv);
-  free(tmpLU);
+  LIBP_ABORT("dgecon reports info = " << info << " when computing condition number",
+             info);
 
-  return (double) 1.0/Acond;
+  return 1.0/Acond;
 }
 
-float matrixConditionNumber(int N, float *A) {
+float linAlg_t::matrixConditionNumber(const int N, const memory<float> A) {
 
+  int n = N;
   int lwork = 4*N;
   int info;
 
@@ -98,41 +89,30 @@ float matrixConditionNumber(int N, float *A) {
   float Acond;
   float Anorm;
 
-  float *tmpLU = (float*) calloc(N*N, sizeof(float));
+  memory<float> tmpLU(N*N);
 
-  int *ipiv = (int*) calloc(N, sizeof(int));
-  float *work = (float*) calloc(lwork, sizeof(float));
-  int  *iwork = (int*) calloc(N, sizeof(int));
+  memory<int> ipiv(N);
+  memory<float> work(lwork);
+  memory<int> iwork(N);
 
-  for(int n=0;n<N*N;++n){
-    tmpLU[n] = (float) A[n];
-  }
+  tmpLU.copyFrom(A, N*N);
 
   //get the matrix norm of A
-  Anorm = slange_(&norm, &N, &N, tmpLU, &N, work);
+  Anorm = slange_(&norm, &n, &n, tmpLU.ptr(), &n, work.ptr());
 
   //compute LU factorization
-  sgetrf_ (&N, &N, tmpLU, &N, ipiv, &info);
+  sgetrf_ (&n, &n, tmpLU.ptr(), &n, ipiv.ptr(), &info);
 
-  if(info) {
-    std::stringstream ss;
-    ss << "sgetrf reports info = " << info << " when computing condition number";
-    LIBP_ABORT(ss.str());
-  }
+  LIBP_ABORT("sgetrf reports info = " << info << " when computing condition number",
+             info);
 
   //compute inverse condition number
-  sgecon_(&norm, &N, tmpLU, &N, &Anorm, &Acond, work, iwork, &info);
+  sgecon_(&norm, &n, tmpLU.ptr(), &n, &Anorm, &Acond, work.ptr(), iwork.ptr(), &info);
 
-  if(info) {
-    std::stringstream ss;
-    ss << "sgecon reports info = " << info << " when computing condition number";
-    LIBP_ABORT(ss.str());
-  }
+  LIBP_ABORT("sgecon reports info = " << info << " when computing condition number",
+             info);
 
-  free(work);
-  free(iwork);
-  free(ipiv);
-  free(tmpLU);
+  return 1.0/Acond;
+}
 
-  return (float) 1.0/Acond;
-}
\ No newline at end of file
+} //namespace libp
diff --git a/libs/linAlg/linAlgMatrixEig.cpp b/libs/linAlg/linAlgMatrixEig.cpp
new file mode 100644
index 000000000..23c308c79
--- /dev/null
+++ b/libs/linAlg/linAlgMatrixEig.cpp
@@ -0,0 +1,152 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "linAlg.hpp"
+
+extern "C" {
+  void sgeev_(char *JOBVL, char *JOBVR, int *N, float *A, int *LDA, float *WR, float *WI,
+              float *VL, int *LDVL, float *VR, int *LDVR, float *WORK, int *LWORK, int *INFO );
+  void dgeev_(char *JOBVL, char *JOBVR, int *N, double *A, int *LDA, double *WR, double *WI,
+              double *VL, int *LDVL, double *VR, int *LDVR, double *WORK, int *LWORK, int *INFO );
+}
+
+namespace libp {
+
+// compute right eigenvectors (VR) and eigenvalues (WR + i*WI) of the N x N row-major matrix A
+void linAlg_t::matrixEigenVectors(const int N, const memory<double> A,
+                                  memory<double> VR,
+                                  memory<double> WR,
+                                  memory<double> WI){
+  //dgeev_ takes every argument by pointer, so keep a mutable copy of N
+  int n = N;
+  char JOBVL = 'N'; //do not compute left eigenvectors
+  char JOBVR = 'V'; //compute right eigenvectors
+  int LDA = N;
+  int LDVL = N;
+  int LDVR = N;
+  int LWORK = 8*N; //length of the WORK array handed to dgeev_
+
+  memory<double> WORK(LWORK);
+  memory<double> tmpA(N*LDA);
+  memory<double> tmpVR(N*LDVR);
+
+  //tmpA = A^T (row major to column-major)
+  linAlg_t::matrixTranspose(N, N, A, LDA, tmpA, LDA);
+
+  int INFO = -999;
+  //VL is not referenced by dgeev_ when JOBVL='N', so a null pointer is passed for it
+  dgeev_ (&JOBVL, &JOBVR, &n, tmpA.ptr(), &LDA, WR.ptr(), WI.ptr(),
+          nullptr, &LDVL, tmpVR.ptr(), &LDVR, WORK.ptr(), &LWORK, &INFO);
+  //INFO is the abort condition: any nonzero value reported by dgeev_ aborts
+  LIBP_ABORT("dgeev_ reports info = " << INFO, INFO);
+
+  //VR = tmpVR^T (column major to row major)
+  linAlg_t::matrixTranspose(N, N, tmpVR, LDVR, VR, LDVR);
+}
+
+// compute right eigenvectors (VR) and eigenvalues (WR + i*WI) of the N x N row-major matrix A
+void linAlg_t::matrixEigenVectors(const int N, const memory<float> A,
+                                  memory<float> VR,
+                                  memory<float> WR,
+                                  memory<float> WI){
+  //sgeev_ takes every argument by pointer, so keep a mutable copy of N
+  int n = N;
+  char JOBVL = 'N'; //do not compute left eigenvectors
+  char JOBVR = 'V'; //compute right eigenvectors
+  int LDA = N;
+  int LDVL = N;
+  int LDVR = N;
+  int LWORK = 8*N; //length of the WORK array handed to sgeev_
+
+  memory<float> WORK(LWORK);
+  memory<float> tmpA(N*LDA);
+  memory<float> tmpVR(N*LDVR);
+
+  //tmpA = A^T (row major to column-major)
+  linAlg_t::matrixTranspose(N, N, A, LDA, tmpA, LDA);
+
+  int INFO = -999;
+  //VL is not referenced by sgeev_ when JOBVL='N', so a null pointer is passed for it
+  sgeev_ (&JOBVL, &JOBVR, &n, tmpA.ptr(), &LDA, WR.ptr(), WI.ptr(),
+          nullptr, &LDVL, tmpVR.ptr(), &LDVR, WORK.ptr(), &LWORK, &INFO);
+  //INFO is the abort condition: any nonzero value reported by sgeev_ aborts
+  LIBP_ABORT("sgeev_ reports info = " << INFO, INFO);
+
+  //VR = tmpVR^T (column major to row major)
+  linAlg_t::matrixTranspose(N, N, tmpVR, LDVR, VR, LDVR);
+}
+
+// compute eigenvalues of the N x N matrix A: real parts go to WR, imaginary parts to WI
+void linAlg_t::matrixEigenValues(const int N, const memory<double> A,
+                                 memory<double> WR,
+                                 memory<double> WI){
+
+  int n = N;
+  char JOBVL  = 'N';
+  char JOBVR  = 'N';
+  int LDA = N;
+  int LDVL = N;
+  int LDVR = N;
+  int LWORK = 8*N;
+
+  //dgeev_ overwrites its input, so factor a scratch copy (A^T has the same eigenvalues as A)
+  memory<double> tmpA(N*LDA);
+  tmpA.copyFrom(A, N*LDA);
+  memory<double> WORK(LWORK);
+  int INFO = -999;
+
+  dgeev_ (&JOBVL, &JOBVR, &n, tmpA.ptr(), &LDA, WR.ptr(), WI.ptr(),
+          nullptr, &LDVL, nullptr, &LDVR, WORK.ptr(), &LWORK, &INFO);
+
+  LIBP_ABORT("dgeev_ reports info = " << INFO, INFO);
+}
+
+// compute eigenvalues of the N x N matrix A: real parts go to WR, imaginary parts to WI
+void linAlg_t::matrixEigenValues(const int N, const memory<float> A,
+                                 memory<float> WR,
+                                 memory<float> WI){
+
+  int n = N;
+  char JOBVL  = 'N';
+  char JOBVR  = 'N';
+  int LDA = N;
+  int LDVL = N;
+  int LDVR = N;
+  int LWORK = 8*N;
+
+  //sgeev_ overwrites its input, so factor a scratch copy (A^T has the same eigenvalues as A)
+  memory<float> tmpA(N*LDA);
+  tmpA.copyFrom(A, N*LDA);
+  memory<float> WORK(LWORK);
+  int INFO = -999;
+
+  sgeev_ (&JOBVL, &JOBVR, &n, tmpA.ptr(), &LDA, WR.ptr(), WI.ptr(),
+          nullptr, &LDVL, nullptr, &LDVR, WORK.ptr(), &LWORK, &INFO);
+
+  LIBP_ABORT("sgeev_ reports info = " << INFO, INFO);
+}
+
+} //namespace libp
diff --git a/libs/core/matrixInverse.cpp b/libs/linAlg/linAlgMatrixInverse.cpp
similarity index 56%
rename from libs/core/matrixInverse.cpp
rename to libs/linAlg/linAlgMatrixInverse.cpp
index 9c5938513..d81f53394 100644
--- a/libs/core/matrixInverse.cpp
+++ b/libs/linAlg/linAlgMatrixInverse.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,7 @@ SOFTWARE.
 
 */
 
-#include "core.hpp"
+#include "linAlg.hpp"
 
 extern "C" {
   void dgetrf_(int* M, int *N, double* A, int* lda, int* IPIV, int* INFO);
@@ -34,58 +34,42 @@ extern "C" {
   void sgetri_(int* N, float* A, int* lda, int* IPIV, float* WORK, int* lwork, int* INFO);
 }
 
-void matrixInverse(int N, double *A){
+namespace libp {
+
+void linAlg_t::matrixInverse(const int N, memory<double> A){
+  int n = N;
   int lwork = N*N;
   int info;
 
-  // compute inverse mass matrix
-  int *ipiv = (int*) calloc(N, sizeof(int));
-  double *work = (double*) calloc(lwork, sizeof(double));
-
-  dgetrf_ (&N, &N, A, &N, ipiv, &info);
+  // compute inverse matrix in-place
+  memory<int> ipiv(N);
+  memory<double> work(lwork);
 
-  if(info) {
-    std::stringstream ss;
-    ss << "dgetrf_ reports info = " << info;
-    LIBP_ABORT(ss.str());
-  }
+  dgetrf_ (&n, &n, A.ptr(), &n, ipiv.ptr(), &info);
 
-  dgetri_ (&N, A, &N, ipiv, work, &lwork, &info);
+  LIBP_ABORT("dgetrf_ reports info = " << info, info);
 
-  if(info) {
-    std::stringstream ss;
-    ss << "dgetri_ reports info = " << info;
-    LIBP_ABORT(ss.str());
-  }
+  dgetri_ (&n, A.ptr(), &n, ipiv.ptr(), work.ptr(), &lwork, &info);
 
-  free(work);
-  free(ipiv);
+  LIBP_ABORT("dgetri_ reports info = " << info, info);
 }
 
-void matrixInverse(int N, float *A){
+void linAlg_t::matrixInverse(const int N, memory<float> A){
+  int n = N;
   int lwork = N*N;
   int info;
 
-  // compute inverse mass matrix
-  int *ipiv = (int*) calloc(N, sizeof(int));
-  float *work = (float*) calloc(lwork, sizeof(float));
+  // compute inverse matrix in-place
+  memory<int> ipiv(N);
+  memory<float> work(lwork);
 
-  sgetrf_ (&N, &N, A, &N, ipiv, &info);
+  sgetrf_ (&n, &n, A.ptr(), &n, ipiv.ptr(), &info);
 
-  if(info) {
-    std::stringstream ss;
-    ss << "sgetrf_ reports info = " << info;
-    LIBP_ABORT(ss.str());
-  }
+  LIBP_ABORT("sgetrf_ reports info = " << info, info);
 
-  sgetri_ (&N, A, &N, ipiv, work, &lwork, &info);
+  sgetri_ (&n, A.ptr(), &n, ipiv.ptr(), work.ptr(), &lwork, &info);
 
-  if(info) {
-    std::stringstream ss;
-    ss << "sgetri_ reports info = " << info;
-    LIBP_ABORT(ss.str());
-  }
-
-  free(work);
-  free(ipiv);
+  LIBP_ABORT("sgetri_ reports info = " << info, info);
 }
+
+} //namespace libp
diff --git a/libs/linAlg/linAlgMatrixRightSolve.cpp b/libs/linAlg/linAlgMatrixRightSolve.cpp
new file mode 100644
index 000000000..05019cf62
--- /dev/null
+++ b/libs/linAlg/linAlgMatrixRightSolve.cpp
@@ -0,0 +1,381 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "linAlg.hpp"
+
+extern "C" {
+  void dgesv_ ( int     *N, int     *NRHS, double  *A,
+                int     *LDA,
+                int     *IPIV,
+                double  *B,
+                int     *LDB,
+                int     *INFO );
+
+  void sgesv_ ( int     *N, int     *NRHS, float  *A,
+                int     *LDA,
+                int     *IPIV,
+                float  *B,
+                int     *LDB,
+                int     *INFO );
+
+  void dgels_ ( char   *TRANS,
+                int    *M,
+                int    *N,
+                int    *NRHS,
+                double *A,
+                int    *LDA,
+                double *B,
+                int    *LDB,
+                double *WORK,
+                int    *LWORK,
+                int    *INFO);
+
+  void sgels_ ( char   *TRANS,
+                int    *M,
+                int    *N,
+                int    *NRHS,
+                float  *A,
+                int    *LDA,
+                float  *B,
+                int    *LDB,
+                float  *WORK,
+                int    *LWORK,
+                int    *INFO);
+
+  void dgeqp3_( int    *M,
+                int    *N,
+                double *A,
+                int    *LDA,
+                int    *JPVT,
+                double *TAU,
+                double *WORK,
+                int    *LWORK,
+                int    *INFO);
+
+  void sgeqp3_( int    *M,
+                int    *N,
+                float  *A,
+                int    *LDA,
+                int    *JPVT,
+                float  *TAU,
+                float  *WORK,
+                int    *LWORK,
+                int    *INFO);
+
+  void dormqr_( char   *SIDE,
+                char   *TRANS,
+                int    *M,
+                int    *N,
+                int    *K,
+                double *A,
+                int    *LDA,
+                double *TAU,
+                double *C,
+                int    *LDC,
+                double *WORK,
+                int    *LWORK,
+                int    *INFO);
+
+  void sormqr_( char   *SIDE,
+                char   *TRANS,
+                int    *M,
+                int    *N,
+                int    *K,
+                float  *A,
+                int    *LDA,
+                float  *TAU,
+                float  *C,
+                int    *LDC,
+                float  *WORK,
+                int    *LWORK,
+                int    *INFO);
+
+  void dtrsm_ ( char   *SIDE,
+                char   *UPLO,
+                char   *TRANSA,
+                char   *DIAG,
+                int    *M,
+                int    *N,
+                double *ALPHA,
+                double *A,
+                int    *LDA,
+                double *B,
+                int    *LDB);
+
+  void strsm_ ( char   *SIDE,
+                char   *UPLO,
+                char   *TRANSA,
+                char   *DIAG,
+                int    *M,
+                int    *N,
+                float  *ALPHA,
+                float  *A,
+                int    *LDA,
+                float  *B,
+                int    *LDB);
+}
+
+namespace libp {
+
+// C = A/B  = trans(trans(B)\trans(A))
+// A, B and C are row major; read column-major by LAPACK they are trans(A), trans(B)
+// and trans(C), so a single dgesv_ solve of trans(B)*X = trans(A) leaves C = A/B in C.
+// NOTE: dgesv_ solves a square system, so B is presumably NcolsA x NcolsA - not checked here.
+void linAlg_t::matrixRightSolve(const int NrowsA, const int NcolsA, const memory<double> A,
+                                const int NrowsB, const int NcolsB, const memory<double> B,
+                                memory<double> C){
+
+  int info;
+
+  int NrowsX = NcolsB;
+  int NcolsX = NrowsB;
+
+  int NrowsY = NcolsA;
+  int NcolsY = NrowsA;
+
+  // dgesv_ overwrites both matrix arguments and needs no workspace array:
+  // factorize a scratch copy of B and seed C with A so the solution lands in C
+  memory<double> tmpX(NrowsX*NcolsX);
+  memory<int>    ipiv(NrowsX);
+
+  tmpX.copyFrom(B, NrowsX*NcolsX);
+  C.copyFrom(A, NrowsY*NcolsY);
+
+  dgesv_(&NrowsX, &NcolsY, tmpX.ptr(), &NrowsX, ipiv.ptr(), C.ptr(), &NrowsY, &info);
+
+  LIBP_ABORT("dgesv_ reports info = " << info, info);
+}
+
+// C = A/B  = trans(trans(B)\trans(A))
+// A, B and C are row major; read column-major by LAPACK they are trans(A), trans(B)
+// and trans(C), so a single sgesv_ solve of trans(B)*X = trans(A) leaves C = A/B in C.
+// NOTE: sgesv_ solves a square system, so B is presumably NcolsA x NcolsA - not checked here.
+void linAlg_t::matrixRightSolve(const int NrowsA, const int NcolsA, const memory<float> A,
+                                const int NrowsB, const int NcolsB, const memory<float> B,
+                                memory<float> C){
+
+  int info;
+
+  int NrowsX = NcolsB;
+  int NcolsX = NrowsB;
+
+  int NrowsY = NcolsA;
+  int NcolsY = NrowsA;
+
+  // sgesv_ overwrites both matrix arguments and needs no workspace array:
+  // factorize a scratch copy of B and seed C with A so the solution lands in C
+  memory<float> tmpX(NrowsX*NcolsX);
+  memory<int>   ipiv(NrowsX);
+
+  tmpX.copyFrom(B, NrowsX*NcolsX);
+  C.copyFrom(A, NrowsY*NcolsY);
+
+  sgesv_(&NrowsX, &NcolsY, tmpX.ptr(), &NrowsX, ipiv.ptr(), C.ptr(), &NrowsY, &info);
+
+  LIBP_ABORT("sgesv_ reports info = " << info, info);
+}
+
+// Minimum-norm solve of the underdetermined system xA = b (NrowsA > NcolsA).
+//
+// A is read as ROW MAJOR storage; b supplies NcolsA values, x receives NrowsA.
+void linAlg_t::matrixUnderdeterminedRightSolveMinNorm(const int NrowsA, const int NcolsA,
+                                                      const memory<double> A, const memory<double> b,
+                                                      memory<double> x) {
+  // dgels_ takes every scalar by pointer, so keep addressable local copies.
+  int  m = NcolsA;       // rows of the column-major view of A (i.e. of A^T)
+  int  n = NrowsA;       // columns of that view == number of unknowns
+  int  nrhs = 1;
+  int  lwork = 2*NrowsA*NcolsA;
+  int  info = 0;
+  char trans = 'N';      // solve A^T x^T = b^T; row-major A already reads as A^T
+
+  // dgels_ works in place: scratch copy of A, plus a right-hand side buffer
+  // of NrowsA entries so the solution can be read back from it.
+  memory<double> rhs(NrowsA);
+  memory<double> mat(NrowsA*NcolsA);
+  memory<double> wspace(lwork);
+
+  rhs.copyFrom(b, NcolsA);
+  mat.copyFrom(A, NrowsA*NcolsA);
+
+  dgels_(&trans, &m, &n, &nrhs, mat.ptr(), &m, rhs.ptr(), &n, wspace.ptr(), &lwork, &info);
+  LIBP_ABORT("dgels_ returned INFO = " << info, info);
+
+  x.copyFrom(rhs, NrowsA);
+}
+
+// Find minimum-norm solution to xA = b with NrowsA > NcolsA (underdetermined).
+//
+// NB:  A must be stored ROW MAJOR.  b supplies NcolsA entries, x receives NrowsA.
+void linAlg_t::matrixUnderdeterminedRightSolveMinNorm(const int NrowsA, const int NcolsA,
+                                                      const memory<float> A, const memory<float> b,
+                                                      memory<float> x) {
+  // Solve A^T x^T = b^T.  Note TRANS = 'N', since A is row major.
+  int  INFO  = 0;
+  char TRANS = 'N';
+  int  NRHS  = 1;
+  int  LWORK = 2*NrowsA*NcolsA;
+  int  Nrows = NrowsA;
+  int  Ncols = NcolsA;
+
+  memory<float> WORK(LWORK);
+  memory<float> tmpA(NrowsA*NcolsA);  // sgels_ overwrites its matrix argument
+  memory<float> tmpb(NrowsA);         // NrowsA long: the solution is read back from this buffer
+
+  tmpA.copyFrom(A, NrowsA*NcolsA);
+  tmpb.copyFrom(b, NcolsA);
+
+  sgels_(&TRANS, &Ncols, &Nrows, &NRHS, tmpA.ptr(), &Ncols, tmpb.ptr(), &Nrows, WORK.ptr(), &LWORK, &INFO);
+
+  LIBP_ABORT("sgels_ returned INFO = " << INFO, INFO);  // name the routine that was actually called
+
+  // Copy to output.
+  x.copyFrom(tmpb, NrowsA);
+}
+
+// Solve xA = b with NrowsA > NcolsA (underdetermined) using column-pivoted QR.
+//
+// Done by solving A^T x^T = b^T in 4 steps:
+//   1.  Decompose A^T * P = Q * R.  -->  Q * R * P^T x^T = b^T
+//   2.  Multiply by Q^T.            -->  R * P^T x^T = Q^T b^T
+//   3.  Backsolve with R1.          -->  P^T * x^T = R1^{-1} Q^T b^T
+//       where R1 = leading NcolsA * NcolsA submatrix of R.
+//   4.  Apply permutation.          -->  x^T = P R1^{-1} Q^T b^T
+//
+// NB:  A must be stored ROW MAJOR.  b supplies NcolsA entries, x receives NrowsA.
+void linAlg_t::matrixUnderdeterminedRightSolveCPQR(const int NrowsA, const int NcolsA,
+                                                   const memory<double> A, const memory<double> b,
+                                                   memory<double> x) {
+  int INFO  = 0;
+  int LWORK = 3*NrowsA + 1;  // workspace length for dgeqp3_ (its documented minimum is 3*N+1, N = NrowsA here)
+  int Nrows = NrowsA;
+  int Ncols = NcolsA;
+
+  memory<int>    JPVT(NrowsA, 0);  // presumably zero-filled; dgeqp3_ treats a 0 entry as a free column
+  memory<double> TAU(std::min(NrowsA, NcolsA));
+
+  memory<double> WORK;
+  memory<double> tmpA(NrowsA*NcolsA);
+  memory<double> tmpb(NrowsA, 0.0);  // presumably zero-filled; only the first NcolsA entries are set from b
+
+  WORK.malloc(LWORK);
+  tmpA.copyFrom(A, NrowsA*NcolsA);
+  tmpb.copyFrom(b, NcolsA);
+
+  // Compute A^T * P = Q * R.
+  dgeqp3_(&Ncols, &Nrows, tmpA.ptr(), &Ncols, JPVT.ptr(), TAU.ptr(), WORK.ptr(), &LWORK, &INFO);
+
+  LIBP_ABORT("dgeqp3_ returned INFO = " << INFO, INFO);
+
+  // Compute Q^T * b^T.
+  char SIDE = 'L';
+  char TRANS = 'T';
+  int  NRHS = 1;
+  int  NREFLS = NcolsA;
+
+  LWORK = 1;  // workspace length for dormqr_ with SIDE = 'L' and a single right-hand side
+  WORK.malloc(LWORK);
+  dormqr_(&SIDE, &TRANS, &Ncols, &NRHS, &NREFLS, tmpA.ptr(), &Ncols, TAU.ptr(), tmpb.ptr(), &Ncols, WORK.ptr(), &LWORK, &INFO);
+
+  LIBP_ABORT("dormqr_ returned INFO = " << INFO, INFO);
+
+  // Compute R1^{-1} * Q^T * b^T
+  SIDE = 'L';
+  char UPLO = 'U';
+  char TRANSA = 'N';
+  char DIAG = 'N';
+  NRHS = 1;
+  double ALPHA = 1.0;
+
+  dtrsm_(&SIDE, &UPLO, &TRANSA, &DIAG, &Ncols, &NRHS, &ALPHA, tmpA.ptr(), &Ncols, tmpb.ptr(), &Ncols);
+
+  // Apply the permutation.
+  for (int i = 0; i < NrowsA; i++)
+    x[JPVT[i] - 1] = tmpb[i];  // JPVT entries are 1-based, hence the -1
+}
+
+// Solve xA = b with NrowsA > NcolsA (underdetermined) using column-pivoted QR.
+//
+// Done by solving A^T x^T = b^T in 4 steps:
+//   1.  Decompose A^T * P = Q * R.  -->  Q * R * P^T x^T = b^T
+//   2.  Multiply by Q^T.            -->  R * P^T x^T = Q^T b^T
+//   3.  Backsolve with R1.          -->  P^T * x^T = R1^{-1} Q^T b^T
+//       where R1 = leading NcolsA * NcolsA submatrix of R.
+//   4.  Apply permutation.          -->  x^T = P R1^{-1} Q^T b^T
+//
+// NB:  A must be stored ROW MAJOR.  b supplies NcolsA entries, x receives NrowsA.
+void linAlg_t::matrixUnderdeterminedRightSolveCPQR(const int NrowsA, const int NcolsA,
+                                                   const memory<float> A, const memory<float> b,
+                                                   memory<float> x) {
+  int INFO  = 0;
+  int LWORK = 3*NrowsA + 1;  // workspace length for sgeqp3_ (its documented minimum is 3*N+1, N = NrowsA here)
+  int Nrows = NrowsA;
+  int Ncols = NcolsA;
+
+  memory<int>    JPVT(NrowsA, 0);  // presumably zero-filled; sgeqp3_ treats a 0 entry as a free column
+  memory<float> TAU(std::min(NrowsA, NcolsA));
+
+  memory<float> WORK;
+  memory<float> tmpA(NrowsA*NcolsA);
+  memory<float> tmpb(NrowsA, 0.0);  // presumably zero-filled; only the first NcolsA entries are set from b
+
+  WORK.malloc(LWORK);
+  tmpA.copyFrom(A, NrowsA*NcolsA);
+  tmpb.copyFrom(b, NcolsA);
+
+  // Compute A^T * P = Q * R.
+  sgeqp3_(&Ncols, &Nrows, tmpA.ptr(), &Ncols, JPVT.ptr(), TAU.ptr(), WORK.ptr(), &LWORK, &INFO);
+
+  LIBP_ABORT("sgeqp3_ returned INFO = " << INFO, INFO);  // report the routine that was actually called
+
+  // Compute Q^T * b^T.
+  char SIDE = 'L';
+  char TRANS = 'T';
+  int  NRHS = 1;
+  int  NREFLS = NcolsA;
+
+  LWORK = 1;  // workspace length for sormqr_ with SIDE = 'L' and a single right-hand side
+  WORK.malloc(LWORK);
+  sormqr_(&SIDE, &TRANS, &Ncols, &NRHS, &NREFLS, tmpA.ptr(), &Ncols, TAU.ptr(), tmpb.ptr(), &Ncols, WORK.ptr(), &LWORK, &INFO);
+
+  LIBP_ABORT("sormqr_ returned INFO = " << INFO, INFO);  // report the routine that was actually called
+
+  // Compute R1^{-1} * Q^T * b^T
+  SIDE = 'L';
+  char UPLO = 'U';
+  char TRANSA = 'N';
+  char DIAG = 'N';
+  NRHS = 1;
+  float ALPHA = 1.0;
+
+  strsm_(&SIDE, &UPLO, &TRANSA, &DIAG, &Ncols, &NRHS, &ALPHA, tmpA.ptr(), &Ncols, tmpb.ptr(), &Ncols);
+
+  // Apply the permutation.
+  for (int i = 0; i < NrowsA; i++)
+    x[JPVT[i] - 1] = tmpb[i];  // JPVT entries are 1-based, hence the -1
+}
+
+} //namespace libp
diff --git a/libs/core/matrixTranspose.cpp b/libs/linAlg/linAlgMatrixTranspose.cpp
similarity index 61%
rename from libs/core/matrixTranspose.cpp
rename to libs/linAlg/linAlgMatrixTranspose.cpp
index 0277cdd32..49c06949e 100644
--- a/libs/core/matrixTranspose.cpp
+++ b/libs/linAlg/linAlgMatrixTranspose.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,13 +24,15 @@ SOFTWARE.
 
 */
 
-#include "core.hpp"
+#include "linAlg.hpp"
+
+namespace libp {
 
 template<typename T>
-inline
+static inline
 void matrixTranspose_t(const int M, const int N,
-                       const T  *A, const int LDA,
-                             T *AT, const int LDAT){
+                       const memory<T>  A, const int LDA,
+                             memory<T> AT, const int LDAT){
 
   //A & A^T - Row major ordering
   //M = number of rows of A, columns of A^T
@@ -42,10 +44,8 @@ void matrixTranspose_t(const int M, const int N,
   if (N<1 || M<1) return;
 
   //check for weird input
-  if (LDA<N || LDAT<M) {
-    printf("Bad input to matrixTranspose\n");
-    return;
-  }
+  LIBP_ABORT("Bad input to matrixTranspose\n",
+             LDA<N || LDAT<M);
 
   for (int n=0;n<N;n++) { //for all cols of A^T
     for (int m=0;m<M;m++) { //for all rows of A^T
@@ -54,26 +54,28 @@ void matrixTranspose_t(const int M, const int N,
   }
 }
 
-void matrixTranspose(const int M, const int N,
-                     const float  *A, const int LDA,
-                           float *AT, const int LDAT) {
+void linAlg_t::matrixTranspose(const int M, const int N,
+                     const memory<float>  A, const int LDA,
+                           memory<float> AT, const int LDAT) {
   matrixTranspose_t(M, N, A, LDA, AT, LDAT);
 }
 
-void matrixTranspose(const int M, const int N,
-                     const double  *A, const int LDA,
-                           double *AT, const int LDAT) {
+void linAlg_t::matrixTranspose(const int M, const int N,
+                     const memory<double>  A, const int LDA,
+                           memory<double> AT, const int LDAT) {
   matrixTranspose_t(M, N, A, LDA, AT, LDAT);
 }
 
-void matrixTranspose(const int M, const int N,
-                     const int  *A, const int LDA,
-                           int *AT, const int LDAT) {
+void linAlg_t::matrixTranspose(const int M, const int N,
+                     const memory<int>  A, const int LDA,
+                           memory<int> AT, const int LDAT) {
   matrixTranspose_t(M, N, A, LDA, AT, LDAT);
 }
 
-void matrixTranspose(const int M, const int N,
-                     const long long int  *A, const int LDA,
-                           long long int *AT, const int LDAT) {
+void linAlg_t::matrixTranspose(const int M, const int N,
+                     const memory<long long int>  A, const int LDA,
+                           memory<long long int> AT, const int LDAT) {
   matrixTranspose_t(M, N, A, LDA, AT, LDAT);
-}
\ No newline at end of file
+}
+
+} //namespace libp
diff --git a/libs/linAlg/linAlgSetup.cpp b/libs/linAlg/linAlgSetup.cpp
index 640a75096..1e6b6ca6c 100644
--- a/libs/linAlg/linAlgSetup.cpp
+++ b/libs/linAlg/linAlgSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,32 +28,30 @@ SOFTWARE.
 #include "linAlg.hpp"
 #include "platform.hpp"
 
-#define LINALG_BLOCKSIZE 512
+namespace libp {
 
-linAlg_t::linAlg_t(): blocksize(LINALG_BLOCKSIZE) {};
+linAlg_t::linAlg_t() {};
 
 void linAlg_t::Setup(platform_t *_platform) {
 
   platform = _platform;
-  kernelInfo = platform->props;
+  kernelInfo = platform->props(); // start from the platform's properties; linAlg defines are added below
 
   //add defines
-  kernelInfo["defines/" "p_blockSize"] = (int)LINALG_BLOCKSIZE;
-
+  kernelInfo["defines/" "p_blockSize"] = blocksize; // referenced as p_blockSize inside the OKL kernels
   kernelInfo["defines/init_dfloat_min"] =  std::numeric_limits<dfloat>::max();
   kernelInfo["defines/init_dfloat_max"] = -std::numeric_limits<dfloat>::max();
 
   //pinned scratch buffer
-  scratch = (dfloat*) platform->hostMalloc(LINALG_BLOCKSIZE*sizeof(dfloat),
-                                           NULL, h_scratch);
-  o_scratch = platform->malloc(LINALG_BLOCKSIZE*sizeof(dfloat));
+  h_scratch = platform->hostMalloc<dfloat>(blocksize); // blocksize dfloat entries
+  o_scratch = platform->malloc<dfloat>(blocksize);     // same length as h_scratch
 }
 
 //initialize list of kernels
-void linAlg_t::InitKernels(vector<string> kernels) {
+void linAlg_t::InitKernels(std::vector<std::string> kernels) {
 
   for (size_t i=0;i<kernels.size();i++) {
-    string name = kernels[i];
+    std::string name = kernels[i];
     if (name=="set") {
       if (setKernel.isInitialized()==false)
         setKernel = platform->buildKernel(LINALG_DIR "/okl/"
@@ -121,72 +119,86 @@ void linAlg_t::InitKernels(vector<string> kernels) {
                                         "zadxpy",
                                         kernelInfo);
     } else if (name=="min") {
-      if (minKernel.isInitialized()==false)
-        minKernel = platform->buildKernel(LINALG_DIR "/okl/"
+      if (minKernel1.isInitialized()==false) {
+        minKernel1 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgMin.okl",
+                                        "min1",
+                                        kernelInfo);
+        minKernel2 = platform->buildKernel(LINALG_DIR "/okl/"
                                         "linAlgMin.okl",
-                                        "min",
+                                        "min2",
                                         kernelInfo);
+      }
     } else if (name=="max") {
-      if (maxKernel.isInitialized()==false)
-        maxKernel = platform->buildKernel(LINALG_DIR "/okl/"
+      if (maxKernel1.isInitialized()==false) {
+        maxKernel1 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgMax.okl",
+                                        "max1",
+                                        kernelInfo);
+        maxKernel2 = platform->buildKernel(LINALG_DIR "/okl/"
                                         "linAlgMax.okl",
-                                        "max",
+                                        "max2",
                                         kernelInfo);
+      }
     } else if (name=="sum") {
-      if (sumKernel.isInitialized()==false)
-        sumKernel = platform->buildKernel(LINALG_DIR "/okl/"
+      if (sumKernel1.isInitialized()==false) {
+        sumKernel1 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgSum.okl",
+                                        "sum1",
+                                        kernelInfo);
+        sumKernel2 = platform->buildKernel(LINALG_DIR "/okl/"
                                         "linAlgSum.okl",
-                                        "sum",
+                                        "sum2",
                                         kernelInfo);
+      }
     } else if (name=="norm2") {
-      if (norm2Kernel.isInitialized()==false)
-        norm2Kernel = platform->buildKernel(LINALG_DIR "/okl/"
+      if (norm2Kernel1.isInitialized()==false) {
+        norm2Kernel1 = platform->buildKernel(LINALG_DIR "/okl/"
                                         "linAlgNorm2.okl",
-                                        "norm2",
+                                        "norm2_1",
                                         kernelInfo);
+        norm2Kernel2 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgNorm2.okl",
+                                        "norm2_2",
+                                        kernelInfo);
+      }
     } else if (name=="weightedNorm2") {
-      if (weightedNorm2Kernel.isInitialized()==false)
-        weightedNorm2Kernel = platform->buildKernel(LINALG_DIR "/okl/"
+      if (weightedNorm2Kernel1.isInitialized()==false) { // the _1 and _2 kernels are built as a pair
+        weightedNorm2Kernel1 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgWeightedNorm2.okl",
+                                        "weightedNorm2_1",
+                                        kernelInfo);
+        weightedNorm2Kernel2 = platform->buildKernel(LINALG_DIR "/okl/"
                                         "linAlgWeightedNorm2.okl",
-                                        "weightedNorm2",
+                                        "weightedNorm2_2",
                                         kernelInfo);
+      }
     } else if (name=="innerProd") {
-      if (innerProdKernel.isInitialized()==false)
-        innerProdKernel = platform->buildKernel(LINALG_DIR "/okl/"
+      if (innerProdKernel1.isInitialized()==false) {
+        innerProdKernel1 = platform->buildKernel(LINALG_DIR "/okl/"
                                         "linAlgInnerProd.okl",
-                                        "innerProd",
+                                        "innerProd1",
                                         kernelInfo);
+        innerProdKernel2 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgInnerProd.okl",
+                                        "innerProd2",
+                                        kernelInfo);
+      }
     } else if (name=="weightedInnerProd") {
-      if (weightedInnerProdKernel.isInitialized()==false)
-        weightedInnerProdKernel = platform->buildKernel(LINALG_DIR "/okl/"
+      if (weightedInnerProdKernel1.isInitialized()==false) {
+        weightedInnerProdKernel1 = platform->buildKernel(LINALG_DIR "/okl/"
+                                        "linAlgWeightedInnerProd.okl",
+                                        "weightedInnerProd1",
+                                        kernelInfo);
+        weightedInnerProdKernel2 = platform->buildKernel(LINALG_DIR "/okl/"
                                         "linAlgWeightedInnerProd.okl",
-                                        "weightedInnerProd",
+                                        "weightedInnerProd2",
                                         kernelInfo);
+      }
     } else {
-      stringstream ss;
-      ss << "Requested linAlg routine \"" << name << "\" not found";
-      LIBP_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Requested linAlg routine \"" << name << "\" not found");
     }
   }
 }
 
-linAlg_t::~linAlg_t() {
-  setKernel.free();
-  addKernel.free();
-  scaleKernel.free();
-  axpyKernel.free();
-  zaxpyKernel.free();
-  amxKernel.free();
-  amxpyKernel.free();
-  zamxpyKernel.free();
-  adxKernel.free();
-  adxpyKernel.free();
-  zadxpyKernel.free();
-  minKernel.free();
-  maxKernel.free();
-  sumKernel.free();
-  norm2Kernel.free();
-  weightedNorm2Kernel.free();
-  innerProdKernel.free();
-  weightedInnerProdKernel.free();
-}
\ No newline at end of file
+} //namespace libp
diff --git a/libs/linAlg/okl/linAlgADXPY.okl b/libs/linAlg/okl/linAlgADXPY.okl
index dacd3bfa8..fc5fa9f89 100644
--- a/libs/linAlg/okl/linAlgADXPY.okl
+++ b/libs/linAlg/okl/linAlgADXPY.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -60,4 +60,4 @@ SOFTWARE.
   for(dlong n=0;n<N;++n;@tile(p_blockSize,@outer,@inner)){
     z[n] = alpha*x[n]/a[n] + beta*y[n];
   }
-}
\ No newline at end of file
+}
diff --git a/libs/linAlg/okl/linAlgAMXPY.okl b/libs/linAlg/okl/linAlgAMXPY.okl
index d58dca2ea..3cdc2d9c1 100644
--- a/libs/linAlg/okl/linAlgAMXPY.okl
+++ b/libs/linAlg/okl/linAlgAMXPY.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/linAlg/okl/linAlgAXPY.okl b/libs/linAlg/okl/linAlgAXPY.okl
index f59062fb1..6b32bb4aa 100644
--- a/libs/linAlg/okl/linAlgAXPY.okl
+++ b/libs/linAlg/okl/linAlgAXPY.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -65,4 +65,4 @@ SOFTWARE.
     }
   }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/libs/linAlg/okl/linAlgAdd.okl b/libs/linAlg/okl/linAlgAdd.okl
index 3ad1f6ce0..a050f7e09 100644
--- a/libs/linAlg/okl/linAlgAdd.okl
+++ b/libs/linAlg/okl/linAlgAdd.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/linAlg/okl/linAlgInnerProd.okl b/libs/linAlg/okl/linAlgInnerProd.okl
index c84ac85b6..d5b322a3d 100644
--- a/libs/linAlg/okl/linAlgInnerProd.okl
+++ b/libs/linAlg/okl/linAlgInnerProd.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,7 +25,7 @@ SOFTWARE.
 */
 
 
-@kernel void innerProd(const dlong Nblocks,
+@kernel void innerProd1(const dlong Nblocks,
                        const dlong N,
                        @restrict const  dfloat *x,
                        @restrict const  dfloat *y,
@@ -34,54 +34,71 @@ SOFTWARE.
 
   for(dlong b=0;b<Nblocks;++b;@outer(0)){
 
-    @shared volatile dfloat s_dot[p_blockSize];
+    @shared dfloat s_dot[p_blockSize];
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
       dlong id = t + b*p_blockSize;
-      s_dot[t] = 0.0;
+
+      dfloat r_dot = 0.0;
       while (id<N) {
-        s_dot[t] += x[id]*y[id];
+        r_dot += x[id]*y[id];
         id += p_blockSize*Nblocks;
       }
+      s_dot[t] = r_dot;
     }
 
-    @barrier("local");
-
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_dot[t] += s_dot[t+512];
-    @barrier("local");
 #endif
-
 #if p_blockSize>256
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_dot[t] += s_dot[t+256];
-    @barrier("local");
 #endif
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_dot[t] += s_dot[t+128];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_dot[t] += s_dot[t+ 64];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_dot[t] += s_dot[t+ 32];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_dot[t] += s_dot[t+ 16];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_dot[t] += s_dot[t+  8];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_dot[t] += s_dot[t+  4];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_dot[t] += s_dot[t+  2];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) dot[b] = s_dot[0] + s_dot[1];
   }
 }
 
+@kernel void innerProd2(const dlong Nblocks, @restrict dfloat *dot){
+  // Sums dot[0..Nblocks) and stores the total in dot[0] (in place).
+  // Runs a single outer iteration (b<1) with p_blockSize inner iterations.
+  for(dlong b=0;b<1;++b;@outer(0)){
+
+    @shared dfloat s_dot[p_blockSize];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0)){
+      dlong id = t;
+      // iteration t accumulates dot[t], dot[t+p_blockSize], ... while the index is below Nblocks
+      dfloat r_dot = 0.0;
+      while (id<Nblocks) {
+        r_dot += dot[id];
+        id += p_blockSize;
+      }
+      s_dot[t] = r_dot;
+    }
+    // halving reduction over s_dot; the unconditional t+128 step reads s_dot[255], so p_blockSize must be >= 256
+#if p_blockSize>512
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_dot[t] += s_dot[t+512];
+#endif
+#if p_blockSize>256
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_dot[t] += s_dot[t+256];
+#endif
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_dot[t] += s_dot[t+128];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_dot[t] += s_dot[t+ 64];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_dot[t] += s_dot[t+ 32];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_dot[t] += s_dot[t+ 16];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_dot[t] += s_dot[t+  8];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_dot[t] += s_dot[t+  4];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_dot[t] += s_dot[t+  2];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) dot[0] = s_dot[0] + s_dot[1];
+  }
+}
+
 #if 0
 @kernel void innerProductAtomic(const dlong N,
                                const dlong Nblocks,
@@ -140,4 +157,4 @@ SOFTWARE.
 }
 
 
-#endif
\ No newline at end of file
+#endif
diff --git a/libs/linAlg/okl/linAlgMax.okl b/libs/linAlg/okl/linAlgMax.okl
index 69a3e5cc7..19e87e022 100644
--- a/libs/linAlg/okl/linAlgMax.okl
+++ b/libs/linAlg/okl/linAlgMax.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,35 +24,87 @@ SOFTWARE.
 
 */
 
-@kernel void max(const dlong Nblocks,
-                 const dlong N,
-                 @restrict const  dfloat *x,
-                 @restrict        dfloat *max){
+@kernel void max1(const dlong Nblocks,
+                  const dlong N,
+                  @restrict const  dfloat *x,
+                  @restrict        dfloat *max){
 
 
   for(dlong b=0;b<Nblocks;++b;@outer(0)){
 
-    @shared volatile dfloat s_max[p_blockSize];
+    @shared dfloat s_max[p_blockSize];
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
       dlong id = t + b*p_blockSize;
-      s_max[t] = init_dfloat_max;
+      // partial max over x[id], id = t + b*p_blockSize + k*p_blockSize*Nblocks < N: each entry is read by one (b,t) only
+      dfloat r_max = init_dfloat_max;
       while (id<N) {
-        s_max[t] = (x[id]>s_max[t]) ? x[id] : s_max[t];
-        id += p_blockSize*Nblocks;
+        r_max = (x[id]>r_max) ? x[id] : r_max;
+        id += p_blockSize*Nblocks;
       }
+      s_max[t] = r_max;
     }
 
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0))
       if(t<512) s_max[t] = (s_max[t+512]>s_max[t]) ? s_max[t+512] : s_max[t];
 #endif
-
 #if p_blockSize>256
     for(int t=0;t<p_blockSize;++t;@inner(0))
       if(t<256) s_max[t] = (s_max[t+256]>s_max[t]) ? s_max[t+256] : s_max[t];
 #endif
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<128) s_max[t] = (s_max[t+128]>s_max[t]) ? s_max[t+128] : s_max[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t< 64) s_max[t] = (s_max[t+ 64]>s_max[t]) ? s_max[t+ 64] : s_max[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t< 32) s_max[t] = (s_max[t+ 32]>s_max[t]) ? s_max[t+ 32] : s_max[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t< 16) s_max[t] = (s_max[t+ 16]>s_max[t]) ? s_max[t+ 16] : s_max[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<  8) s_max[t] = (s_max[t+  8]>s_max[t]) ? s_max[t+  8] : s_max[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<  4) s_max[t] = (s_max[t+  4]>s_max[t]) ? s_max[t+  4] : s_max[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<  2) s_max[t] = (s_max[t+  2]>s_max[t]) ? s_max[t+  2] : s_max[t];
 
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<  1) max[b] = (s_max[0]>s_max[1]) ? s_max[0] : s_max[1];
+  }
+}
+
+@kernel void max2(const dlong Nblocks, @restrict dfloat *max){
+
+
+  for(dlong b=0;b<1;++b;@outer(0)){
+
+    @shared dfloat s_max[p_blockSize];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0)){
+      dlong id = t;
+
+      dfloat r_max = init_dfloat_max;
+      while (id<Nblocks) {
+        r_max = (max[id]>r_max) ? max[id] : r_max;
+        id += p_blockSize;
+      }
+      s_max[t] = r_max;
+    }
+
+#if p_blockSize>512
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<512) s_max[t] = (s_max[t+512]>s_max[t]) ? s_max[t+512] : s_max[t];
+#endif
+#if p_blockSize>256
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<256) s_max[t] = (s_max[t+256]>s_max[t]) ? s_max[t+256] : s_max[t];
+#endif
     for(int t=0;t<p_blockSize;++t;@inner(0))
       if(t<128) s_max[t] = (s_max[t+128]>s_max[t]) ? s_max[t+128] : s_max[t];
 
diff --git a/libs/linAlg/okl/linAlgMin.okl b/libs/linAlg/okl/linAlgMin.okl
index 78c586118..28e0a7d84 100644
--- a/libs/linAlg/okl/linAlgMin.okl
+++ b/libs/linAlg/okl/linAlgMin.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,35 +24,87 @@ SOFTWARE.
 
 */
 
-@kernel void min(const dlong Nblocks,
-                 const dlong N,
-                 @restrict const  dfloat *x,
-                 @restrict        dfloat *min){
+@kernel void min1(const dlong Nblocks,            // number of @outer blocks; one partial minimum is produced per block
+                  const dlong N,                  // number of entries of x to scan
+                  @restrict const  dfloat *x,     // input values (read only)
+                  @restrict        dfloat *min){  // output: min[b] receives the minimum found by block b
 
 
   for(dlong b=0;b<Nblocks;++b;@outer(0)){
 
-    @shared volatile dfloat s_min[p_blockSize];
+    @shared dfloat s_min[p_blockSize];  // one slot per thread, folded pairwise by the steps below
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
       dlong id = t + b*p_blockSize;
-      s_min[t] = init_dfloat_min;
+
+      dfloat r_min = init_dfloat_min;  // thread-local running minimum; s_min is written once after the loop
       while (id<N) {
-        s_min[t] = (x[id]<s_min[t]) ? x[id] : s_min[t];
-        id += p_blockSize*Nblocks;
+        r_min = (x[id]<r_min) ? x[id] : r_min;
+        id += p_blockSize*Nblocks;  // step by the whole launch width so no entry is rescanned by another block
       }
+      s_min[t] = r_min;
     }
 
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0))
       if(t<512) s_min[t] = (s_min[t+512]<s_min[t]) ? s_min[t+512] : s_min[t];
 #endif
-
 #if p_blockSize>256
     for(int t=0;t<p_blockSize;++t;@inner(0))
       if(t<256) s_min[t] = (s_min[t+256]<s_min[t]) ? s_min[t+256] : s_min[t];
 #endif
+    for(int t=0;t<p_blockSize;++t;@inner(0))  // NOTE: this step is unconditional and reads s_min[t+128] for t<128, so p_blockSize must be at least 256
+      if(t<128) s_min[t] = (s_min[t+128]<s_min[t]) ? s_min[t+128] : s_min[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t< 64) s_min[t] = (s_min[t+ 64]<s_min[t]) ? s_min[t+ 64] : s_min[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t< 32) s_min[t] = (s_min[t+ 32]<s_min[t]) ? s_min[t+ 32] : s_min[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t< 16) s_min[t] = (s_min[t+ 16]<s_min[t]) ? s_min[t+ 16] : s_min[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<  8) s_min[t] = (s_min[t+  8]<s_min[t]) ? s_min[t+  8] : s_min[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<  4) s_min[t] = (s_min[t+  4]<s_min[t]) ? s_min[t+  4] : s_min[t];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<  2) s_min[t] = (s_min[t+  2]<s_min[t]) ? s_min[t+  2] : s_min[t];
 
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<  1) min[b] = (s_min[0]<s_min[1]) ? s_min[0] : s_min[1];  // thread 0 publishes the block result
+  }
+}
+
+@kernel void min2(const dlong Nblocks, @restrict dfloat *min){
+
+
+  for(dlong b=0;b<1;++b;@outer(0)){
+
+    @shared dfloat s_min[p_blockSize];
+
+    for(int t=0;t<p_blockSize;++t;@inner(0)){
+      dlong id = t;
+
+      dfloat r_min = init_dfloat_min;
+      while (id<Nblocks) {
+        r_min = (min[id]<r_min) ? min[id] : r_min;
+        id += p_blockSize;
+      }
+      s_min[t] = r_min;
+    }
+
+#if p_blockSize>512
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<512) s_min[t] = (s_min[t+512]<s_min[t]) ? s_min[t+512] : s_min[t];
+#endif
+#if p_blockSize>256
+    for(int t=0;t<p_blockSize;++t;@inner(0))
+      if(t<256) s_min[t] = (s_min[t+256]<s_min[t]) ? s_min[t+256] : s_min[t];
+#endif
     for(int t=0;t<p_blockSize;++t;@inner(0))
       if(t<128) s_min[t] = (s_min[t+128]<s_min[t]) ? s_min[t+128] : s_min[t];
 
diff --git a/libs/linAlg/okl/linAlgNorm2.okl b/libs/linAlg/okl/linAlgNorm2.okl
index e4764b072..6d795e7d2 100644
--- a/libs/linAlg/okl/linAlgNorm2.okl
+++ b/libs/linAlg/okl/linAlgNorm2.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +24,7 @@ SOFTWARE.
 
 */
 
-@kernel void norm2(const dlong Nblocks,
+@kernel void norm2_1(const dlong Nblocks,  // first pass: norm[b] receives block b's partial sum of x[i]*x[i]; no square root is taken in this kernel
                    const dlong N,
                   @restrict const  dfloat *x,
                   @restrict        dfloat *norm){
@@ -32,107 +32,65 @@ SOFTWARE.
 
   for(dlong b=0;b<Nblocks;++b;@outer(0)){
 
-    @shared volatile dfloat s_norm[p_blockSize];
+    @shared dfloat s_norm[p_blockSize];  // one partial sum per thread, folded pairwise by the steps below
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
       dlong id = t + b*p_blockSize;
-      s_norm[t] = 0.0;
+
+      dfloat r_norm = 0.0;  // thread-local accumulator; s_norm is written once after the loop
       while (id<N) {
-        s_norm[t] += x[id]*x[id];
+        r_norm += x[id]*x[id];  // id advances by p_blockSize*Nblocks, so each entry is visited by one thread only
         id += p_blockSize*Nblocks;
       }
+      s_norm[t] = r_norm;
     }
 
-    @barrier("local");
-
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_norm[t] += s_norm[t+512];
-    @barrier("local");
 #endif
-
 #if p_blockSize>256
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_norm[t] += s_norm[t+256];
-    @barrier("local");
 #endif
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_norm[t] += s_norm[t+128];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_norm[t] += s_norm[t+ 64];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_norm[t] += s_norm[t+ 32];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_norm[t] += s_norm[t+ 16];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_norm[t] += s_norm[t+  8];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_norm[t] += s_norm[t+  4];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_norm[t] += s_norm[t+  2];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) norm[b] = s_norm[0] + s_norm[1];
   }
 }
 
-#if 0
-@kernel void norm2(const dlong N,
-                  @restrict const  dfloat *  w,
-                  @restrict const  dfloat *  x,
-                  @restrict dfloat *  wxx){
+@kernel void norm2_2(const dlong Nblocks, @restrict  dfloat *norm){  // second pass: adds norm[0..Nblocks) and stores the total (still not square-rooted) in norm[0]
 
+  for(dlong b=0;b<1;++b;@outer(0)){  // exactly one block performs the final reduction
 
-  for(dlong b=0;b<(N+p_blockSize-1)/p_blockSize;++b;@outer(0)){
-
-    @shared volatile dfloat s_wxx[p_blockSize];
+    @shared dfloat s_norm[p_blockSize];  // one partial sum per thread
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
-      const dlong id = t + b*p_blockSize;
-      const dfloat xid = (id<N)?x[id]:0;
-      const dfloat wid = (id<N)?w[id]:0;
-      s_wxx[t] = wid*xid*xid;
+      dlong id = t;
+      dfloat r_norm = 0.0;  // thread-local accumulator
+      while (id<Nblocks) {  // thread t adds entries t, t+p_blockSize, t+2*p_blockSize, ...
+        r_norm += norm[id];
+        id += p_blockSize;
+      }
+      s_norm[t] = r_norm;
     }
 
-    @barrier("local");
-
 #if p_blockSize>512
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_wxx[t] += s_wxx[t+512];
-    @barrier("local");
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_norm[t] += s_norm[t+512];
 #endif
-
 #if p_blockSize>256
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_wxx[t] += s_wxx[t+256];
-    @barrier("local");
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_norm[t] += s_norm[t+256];
 #endif
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_wxx[t] += s_wxx[t+128];
-    @barrier("local");
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_wxx[t] += s_wxx[t+ 64];
-    @barrier("local");
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_wxx[t] += s_wxx[t+ 32];
-    @barrier("local");
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_wxx[t] += s_wxx[t+ 16];
-    //    @barrier("local");
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_wxx[t] += s_wxx[t+  8];
-    //    @barrier("local");
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_wxx[t] += s_wxx[t+  4];
-    //    @barrier("local");
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_wxx[t] += s_wxx[t+  2];
-    //    @barrier("local");
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) wxx[b] = s_wxx[0] + s_wxx[1];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_norm[t] += s_norm[t+128];  // unconditional step: reads up to s_norm[255], so p_blockSize must be at least 256
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_norm[t] += s_norm[t+ 64];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_norm[t] += s_norm[t+ 32];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_norm[t] += s_norm[t+ 16];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_norm[t] += s_norm[t+  8];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_norm[t] += s_norm[t+  4];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_norm[t] += s_norm[t+  2];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) norm[0] = s_norm[0] + s_norm[1];  // overwrites the first partial with the total
   }
 }
-
-#endif
diff --git a/libs/linAlg/okl/linAlgScale.okl b/libs/linAlg/okl/linAlgScale.okl
index e1ae23469..e337caecb 100644
--- a/libs/linAlg/okl/linAlgScale.okl
+++ b/libs/linAlg/okl/linAlgScale.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/linAlg/okl/linAlgSet.okl b/libs/linAlg/okl/linAlgSet.okl
index 59e4d18ae..384dcffab 100644
--- a/libs/linAlg/okl/linAlgSet.okl
+++ b/libs/linAlg/okl/linAlgSet.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/linAlg/okl/linAlgSum.okl b/libs/linAlg/okl/linAlgSum.okl
index 7817bed6f..cb6b1e39c 100644
--- a/libs/linAlg/okl/linAlgSum.okl
+++ b/libs/linAlg/okl/linAlgSum.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,58 +24,73 @@ SOFTWARE.
 
 */
 
-@kernel void sum(const dlong Nblocks,
-                 const dlong N,
-                 @restrict const  dfloat *x,
-                 @restrict        dfloat *sum){
+@kernel void sum1(const dlong Nblocks,            // number of @outer blocks; one partial sum is produced per block
+                  const dlong N,                  // number of entries of x to add up
+                  @restrict const  dfloat *x,     // input values (read only)
+                  @restrict        dfloat *sum){  // output: sum[b] receives block b's partial sum
 
 
   for(dlong b=0;b<Nblocks;++b;@outer(0)){
 
-    @shared volatile dfloat s_sum[p_blockSize];
+    @shared dfloat s_sum[p_blockSize];  // one partial sum per thread, folded pairwise by the steps below
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
       dlong id = t + b*p_blockSize;
-      s_sum[t] = 0.0;
+
+      dfloat r_sum = 0.0;  // thread-local accumulator; s_sum is written once after the loop
       while (id<N) {
-        s_sum[t] += x[id];
+        r_sum += x[id];  // id advances by p_blockSize*Nblocks, so each entry is visited by one thread only
         id += p_blockSize*Nblocks;
       }
+      s_sum[t] = r_sum;
     }
 
-    @barrier("local");
-
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_sum[t] += s_sum[t+512];
-    @barrier("local");
 #endif
-
 #if p_blockSize>256
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_sum[t] += s_sum[t+256];
-    @barrier("local");
 #endif
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_sum[t] += s_sum[t+128];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_sum[t] += s_sum[t+ 64];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_sum[t] += s_sum[t+ 32];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_sum[t] += s_sum[t+ 16];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_sum[t] += s_sum[t+  8];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_sum[t] += s_sum[t+  4];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_sum[t] += s_sum[t+  2];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) sum[b] = s_sum[0] + s_sum[1];
   }
 }
+
+// Second pass of the two-stage sum: adds the Nblocks partial results held in
+// sum[0..Nblocks) and stores the total in sum[0], using a single block.
+@kernel void sum2(const dlong Nblocks, @restrict  dfloat *sum){
+
+  for(dlong b=0;b<1;++b;@outer(0)){
+
+    @shared dfloat s_sum[p_blockSize];
+
+    // thread t privately adds entries t, t+p_blockSize, t+2*p_blockSize, ...
+    for(int t=0;t<p_blockSize;++t;@inner(0)){
+      dfloat partial = 0.0;
+      for (dlong n=t; n<Nblocks; n+=p_blockSize) partial += sum[n];
+      s_sum[t] = partial;
+    }
+
+    // pairwise fold of s_sum; the unguarded steps require p_blockSize>=256
+#if p_blockSize>512
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_sum[t] += s_sum[t+512];
+#endif
+#if p_blockSize>256
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_sum[t] += s_sum[t+256];
+#endif
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_sum[t] += s_sum[t+128];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_sum[t] += s_sum[t+ 64];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_sum[t] += s_sum[t+ 32];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_sum[t] += s_sum[t+ 16];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_sum[t] += s_sum[t+  8];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_sum[t] += s_sum[t+  4];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_sum[t] += s_sum[t+  2];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) sum[0] = s_sum[0] + s_sum[1];
+  }
+}
diff --git a/libs/linAlg/okl/linAlgWeightedInnerProd.okl b/libs/linAlg/okl/linAlgWeightedInnerProd.okl
index 4e9d5d71f..0b8db4ce2 100644
--- a/libs/linAlg/okl/linAlgWeightedInnerProd.okl
+++ b/libs/linAlg/okl/linAlgWeightedInnerProd.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,116 +24,74 @@ SOFTWARE.
 
 */
 
-@kernel void weightedInnerProd(const dlong Nblocks,
-                               const dlong N,
-                               @restrict const  dfloat *w,
-                               @restrict const  dfloat *x,
-                               @restrict const  dfloat *y,
-                               @restrict        dfloat *wxy){
+@kernel void weightedNorm2_1(const dlong Nblocks,  // number of @outer blocks; one partial sum is produced per block
+                             const dlong N,        // number of entries of w and x to process
+                            @restrict const  dfloat *w,  // weights (read only)
+                            @restrict const  dfloat *x,  // input values (read only)
+                            @restrict        dfloat *norm){  // output: norm[b] = block b's partial sum of w[i]*x[i]*x[i]. NOTE(review): this hunk edits linAlgWeightedInnerProd.okl yet defines weightedNorm2_* and drops the y operand - confirm the target file
 
 
   for(dlong b=0;b<Nblocks;++b;@outer(0)){
 
-    @shared volatile dfloat s_wxy[p_blockSize];
+    @shared dfloat s_norm[p_blockSize];  // one partial sum per thread, folded pairwise by the steps below
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
       dlong id = t + b*p_blockSize;
-      s_wxy[t] = 0.0;
+
+      dfloat r_norm = 0.0;  // thread-local accumulator; s_norm is written once after the loop
       while (id<N) {
-        s_wxy[t] += w[id]*x[id]*y[id];
+        r_norm += w[id]*x[id]*x[id];
         id += p_blockSize*Nblocks;
       }
+      s_norm[t] = r_norm;
     }
 
-    @barrier("local");
 #if p_blockSize>512
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_wxy[t] += s_wxy[t+512];
-    @barrier("local");
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_norm[t] += s_norm[t+512];
 #endif
 #if p_blockSize>256
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_wxy[t] += s_wxy[t+256];
-    @barrier("local");
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_norm[t] += s_norm[t+256];
 #endif
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_wxy[t] += s_wxy[t+128];
-    @barrier("local");
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_wxy[t] += s_wxy[t+64];
-    @barrier("local");
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_wxy[t] += s_wxy[t+32];
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_wxy[t] += s_wxy[t+16];
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_wxy[t] += s_wxy[t+8];
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_wxy[t] += s_wxy[t+4];
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_wxy[t] += s_wxy[t+2];
-
-    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) wxy[b] = s_wxy[0] + s_wxy[1];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_norm[t] += s_norm[t+128];  // unconditional step: reads up to s_norm[255], so p_blockSize must be at least 256
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_norm[t] += s_norm[t+ 64];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_norm[t] += s_norm[t+ 32];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_norm[t] += s_norm[t+ 16];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_norm[t] += s_norm[t+  8];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_norm[t] += s_norm[t+  4];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_norm[t] += s_norm[t+  2];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) norm[b] = s_norm[0] + s_norm[1];  // thread 0 publishes the block result
   }
 }
 
-#if 0
-// barrier avoiding (partial) reduction
-@kernel void weightedInnerProduct2(const int N,
-                                  @restrict const  dfloat *  w,
-                                  @restrict const  dfloat *  x,
-                                  @restrict const  dfloat *  y,
-                                  @restrict dfloat *  wxy){
-
-#define G 8
-#define S 32
-
-  // loop over blocks
-  for(int b=0;b<(N+G*S-1)/(G*S);++b;@outer(0)){
-
-    @shared volatile dfloat s_a[G][S];
-    @shared volatile dfloat s_b[G];
-
-    for(int g=0;g<G;++g;@inner(1)){
-      for(int s=0;s<S;++s;@inner(0)){
-        const int n = b*G*S + g*S + s;
-
-        const dfloat wgs = (n<N) ? w[n]:0.f;
-        const dfloat xgs = (n<N) ? x[n]:0.f;
-        const dfloat ygs = (n<N) ? y[n]:0.f;
-
-        s_a[g][s] = wgs*xgs*ygs;
-      }
-    }
-
-    @barrier("local");
-
-    for(int g=0;g<G;++g;@inner(1)){
-      for(int s=0;s<S;++s;@inner(0)){
-        const int n = b*G*S + g*S + s;
+@kernel void weightedNorm2_2(const dlong Nblocks, @restrict  dfloat *norm){  // second pass: adds norm[0..Nblocks) and stores the total (no square root taken) in norm[0]
 
-        if(s<16) s_a[g][s] += s_a[g][s + 16];
-        if(s< 8) s_a[g][s] += s_a[g][s +  8];
-        if(s< 4) s_a[g][s] += s_a[g][s +  4];
-        if(s< 2) s_a[g][s] += s_a[g][s +  2];
-        if(s==0) s_b[g] = s_a[g][0] + s_a[g][1];
-      }
-    }
+  for(dlong b=0;b<1;++b;@outer(0)){  // exactly one block performs the final reduction
 
-    @barrier("local");
+    @shared dfloat s_norm[p_blockSize];  // one partial sum per thread
 
-    for(int g=0;g<G;++g;@inner(1)){
-      for(int s=0;s<S;++s;@inner(0)){
-        if(g==0){
-          if(s< 4) s_b[s] += s_b[s + 4];
-          if(s< 2) s_b[s] += s_b[s + 2];
-          if(s==0) s_b[s] += s_b[s + 1];
-        }
+    for(int t=0;t<p_blockSize;++t;@inner(0)){
+      dlong id = t;
+      dfloat r_norm = 0.0;  // thread-local accumulator
+      while (id<Nblocks) {  // thread t adds entries t, t+p_blockSize, t+2*p_blockSize, ...
+        r_norm += norm[id];
+        id += p_blockSize;
       }
+      s_norm[t] = r_norm;
     }
 
-    @barrier("local");
-
-    for(int g=0;g<G;++g;@inner(1)){
-      for(int s=0;s<S;++s;@inner(0)){
-        if(g==0 && s==0)
-          wxy[b] = s_b[0];
-      }
-    }
+#if p_blockSize>512
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_norm[t] += s_norm[t+512];
+#endif
+#if p_blockSize>256
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_norm[t] += s_norm[t+256];
+#endif
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_norm[t] += s_norm[t+128];  // unconditional step: reads up to s_norm[255], so p_blockSize must be at least 256
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_norm[t] += s_norm[t+ 64];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_norm[t] += s_norm[t+ 32];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_norm[t] += s_norm[t+ 16];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_norm[t] += s_norm[t+  8];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_norm[t] += s_norm[t+  4];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_norm[t] += s_norm[t+  2];
+    for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) norm[0] = s_norm[0] + s_norm[1];  // overwrites the first partial with the total
   }
 }
-#endif
diff --git a/libs/linAlg/okl/linAlgWeightedNorm2.okl b/libs/linAlg/okl/linAlgWeightedNorm2.okl
index 607dda8ff..3033c3b00 100644
--- a/libs/linAlg/okl/linAlgWeightedNorm2.okl
+++ b/libs/linAlg/okl/linAlgWeightedNorm2.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/linearSolver/initialGuess.cpp b/libs/linearSolver/initialGuess.cpp
index 08256aa2d..f3ddf8393 100644
--- a/libs/linearSolver/initialGuess.cpp
+++ b/libs/linearSolver/initialGuess.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,57 +27,13 @@ SOFTWARE.
 #include "initialGuess.hpp"
 #include "mesh.hpp"
 
-initialGuessSolver_t* initialGuessSolver_t::Setup(dlong N, dlong Nhalo, platform_t& platform, settings_t& settings, MPI_Comm comm)
-{
-  initialGuessSolver_t* initialGuessSolver = new initialGuessSolver_t(N, Nhalo, platform, settings, comm);
-  initialGuessSolver->linearSolver = linearSolver_t::Setup(N, Nhalo, platform, settings, comm);
-  initialGuessSolver->igStrategy = nullptr;
-
-  if (settings.compareSetting("INITIAL GUESS STRATEGY", "NONE")) {
-    initialGuessSolver->igStrategy = new igDefaultStrategy(N, platform, settings, comm);
-  } else if (settings.compareSetting("INITIAL GUESS STRATEGY", "ZERO")) {
-    initialGuessSolver->igStrategy = new igZeroStrategy(N, platform, settings, comm);
-  } else if (settings.compareSetting("INITIAL GUESS STRATEGY", "CLASSIC")) {
-    initialGuessSolver->igStrategy = new igClassicProjectionStrategy(N, platform, settings, comm);
-  } else if (settings.compareSetting("INITIAL GUESS STRATEGY", "QR")) {
-    initialGuessSolver->igStrategy = new igRollingQRProjectionStrategy(N, platform, settings, comm);
-  } else if (settings.compareSetting("INITIAL GUESS STRATEGY", "EXTRAP")) {
-    initialGuessSolver->igStrategy = new igExtrapStrategy(N, platform, settings, comm);
-  } else {
-    LIBP_ABORT("Requested INITIAL GUESS STRATEGY not found.");
-  }
-
-  return initialGuessSolver;
-}
-
-initialGuessSolver_t::initialGuessSolver_t(dlong _N, dlong _Nhalo, platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
-  linearSolver_t(_N, _Nhalo, _platform, _settings, _comm),
-  igStrategy(nullptr),
-  linearSolver(nullptr)
-{
-  return;
-}
+namespace libp {
 
-initialGuessSolver_t::~initialGuessSolver_t()
-{
-  delete igStrategy;
-  delete linearSolver;
-}
-
-int initialGuessSolver_t::Solve(solver_t& solver, precon_t& precon, occa::memory& o_x, occa::memory& o_rhs, const dfloat tol, const int MAXIT, const int verbose)
-{
-  int iter = 0;
+namespace InitialGuess {
 
-  igStrategy->FormInitialGuess(o_x, o_rhs);
-  iter = linearSolver->Solve(solver, precon, o_x, o_rhs, tol, MAXIT, verbose);
-  igStrategy->Update(solver, o_x, o_rhs);
-
-  return iter;
-}
-
-/*****************************************************************************/
+#define IG_BLOCKSIZE 256 // chunk length used to compute ctmpNblocks (size of the ctmp/o_ctmp scratch buffers); presumably must match the block size used by igBasisInnerProducts.okl - TODO confirm
 
-void initialGuessAddSettings(settings_t& settings, const string prefix)
+void AddSettings(settings_t& settings, const std::string prefix)
 {
   settings.newSetting(prefix + "INITIAL GUESS STRATEGY",
                       "NONE",
@@ -100,166 +56,116 @@ void initialGuessAddSettings(settings_t& settings, const string prefix)
 
 /*****************************************************************************/
 
-initialGuessStrategy_t::initialGuessStrategy_t(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
-  platform(_platform),
-  settings(_settings),
-  comm(_comm),
-  Ntotal(_N)
-{
-  return;
-}
-
-initialGuessStrategy_t::~initialGuessStrategy_t()
-{
-  return;
-}
-
-/*****************************************************************************/
-
-igDefaultStrategy::igDefaultStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
+Default::Default(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm):
   initialGuessStrategy_t(_N, _platform, _settings, _comm)
-{
-  return;
-}
+{} // no state of its own: only forwards its arguments to initialGuessStrategy_t
 
-void igDefaultStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs)
-{
-  return;
-}
+void Default::FormInitialGuess(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs)
+{} // intentionally empty: o_x is left exactly as the caller passed it
 
-void igDefaultStrategy::Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs)
-{
-  return;
-}
+void Default::Update(operator_t &linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs)
+{} // intentionally empty: this strategy records nothing about the finished solve
 
 /*****************************************************************************/
 
-igZeroStrategy::igZeroStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
+Zero::Zero(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm):
   initialGuessStrategy_t(_N, _platform, _settings, _comm)
 {
-  platform.linAlg.InitKernels({"set"});
-  return;
+  platform.linAlg().InitKernels({"set"}); // request the "set" kernel that FormInitialGuess calls
 }
 
-void igZeroStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs)
+void Zero::FormInitialGuess(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs) // o_rhs is not used
 {
-  platform.linAlg.set(Ntotal, 0.0, o_x);
-  return;
+  platform.linAlg().set(Ntotal, 0.0, o_x); // overwrite the first Ntotal entries of o_x with zero
 }
 
-void igZeroStrategy::Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs)
-{
-  return;
-}
+void Zero::Update(operator_t &linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs)
+{} // intentionally empty: this strategy records nothing about the finished solve
 
 /*****************************************************************************/
 
-igProjectionStrategy::igProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
+Projection::Projection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm): // allocates the history buffers and builds the ig* kernels
   initialGuessStrategy_t(_N, _platform, _settings, _comm)
 {
   curDim = 0;
   settings.getSetting("INITIAL GUESS HISTORY SPACE DIMENSION", maxDim);
 
-  o_btilde = platform.malloc(Ntotal*sizeof(dfloat));
-  o_xtilde = platform.malloc(Ntotal*sizeof(dfloat));
-  o_Btilde = platform.malloc(Ntotal*maxDim*sizeof(dfloat));
-  o_Xtilde = platform.malloc(Ntotal*maxDim*sizeof(dfloat));
+  o_btilde = platform.malloc<dfloat>(Ntotal);        // single work vector of Ntotal entries (sizes are now element counts, not bytes)
+  o_xtilde = platform.malloc<dfloat>(Ntotal);        // single work vector of Ntotal entries
+  o_Btilde = platform.malloc<dfloat>(Ntotal*maxDim); // room for maxDim vectors of Ntotal entries each
+  o_Xtilde = platform.malloc<dfloat>(Ntotal*maxDim); // room for maxDim vectors of Ntotal entries each
 
-  alphas = new dfloat[maxDim]();
-  alphasThisRank = new dfloat[maxDim]();
-  o_alphas = platform.malloc(maxDim*sizeof(dfloat));
+  alphas = platform.hostMalloc<dfloat>(maxDim);      // host-side coefficients, one per history slot
+  o_alphas = platform.malloc<dfloat>(maxDim);        // device copy of the coefficients
 
-  ctmpNblocks = (Ntotal + BLOCKSIZE - 1)/BLOCKSIZE;
-  ctmp = (dfloat*)calloc(ctmpNblocks*maxDim, sizeof(dfloat));
-  o_ctmp = platform.malloc(ctmpNblocks*maxDim*sizeof(dfloat), ctmp);
+  ctmpNblocks = (Ntotal + IG_BLOCKSIZE - 1)/IG_BLOCKSIZE; // ceil(Ntotal/IG_BLOCKSIZE)
+  ctmp = platform.hostMalloc<dfloat>(ctmpNblocks*maxDim); // host scratch: ctmpNblocks partials per history slot
+  o_ctmp = platform.malloc<dfloat>(ctmpNblocks*maxDim);   // device scratch of the same size; no longer initialised from ctmp
 
   // Build kernels.
-  platform.linAlg.InitKernels({"set"});
+  platform.linAlg().InitKernels({"set"});
 
-  occa::properties kernelInfo = platform.props;
+  properties_t kernelInfo = platform.props(); // local copy, so the p_igNhist define below is only set on kernelInfo
   kernelInfo["defines/" "p_igNhist"] = maxDim;
 
   igBasisInnerProductsKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/igBasisInnerProducts.okl", "igBasisInnerProducts", kernelInfo);
   igReconstructKernel        = platform.buildKernel(LINEARSOLVER_DIR "/okl/igReconstruct.okl",        "igReconstruct",        kernelInfo);
   igScaleKernel              = platform.buildKernel(LINEARSOLVER_DIR "/okl/igScale.okl",              "igScale",              kernelInfo);
   igUpdateKernel             = platform.buildKernel(LINEARSOLVER_DIR "/okl/igUpdate.okl",             "igUpdate",             kernelInfo);
-
-  return;
-}
-
-igProjectionStrategy::~igProjectionStrategy()
-{
-  if (ctmp)
-    delete[] ctmp;
-  if (alphas)
-    delete[] alphas;
-  if (alphasThisRank)
-    delete[] alphasThisRank;
-
-  return;
 }
 
-void igProjectionStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs)
+void Projection::FormInitialGuess(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs) // overwrites o_x only when curDim > 0; otherwise o_x is left untouched
 {
   if (curDim > 0) {
-    igBasisInnerProducts(o_rhs, o_Btilde, o_alphas, alphas, alphasThisRank);
-    platform.linAlg.set(Ntotal, 0.0, o_x);
+    igBasisInnerProducts(o_rhs, o_Btilde, o_alphas, alphas); // fills alphas/o_alphas from o_rhs and the stored o_Btilde
+    platform.linAlg().set(Ntotal, 0.0, o_x);                 // zero o_x before it is passed as both input and output below
     igReconstruct(o_x, 1.0, o_alphas, o_Xtilde, o_x);
   }
-
-  return;
 }
 
-void igProjectionStrategy::igBasisInnerProducts(occa::memory& o_x, occa::memory& o_Q, occa::memory& o_c, dfloat *c, dfloat *cThisRank)
+void Projection::igBasisInnerProducts(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_c, pinnedMemory<dfloat>& c) // fills c[0..curDim) from the kernel's per-block partials, reduces over comm, then copies c into o_c
 {
   igBasisInnerProductsKernel(Ntotal, ctmpNblocks, curDim, o_x, o_Q, o_ctmp);
 
-  o_ctmp.copyTo(ctmp, ctmpNblocks*curDim*sizeof(dfloat));
+  ctmp.copyFrom(o_ctmp, ctmpNblocks*curDim); // fetch the kernel's ctmpNblocks*curDim partial results (count is in elements)
 
-  dlong cnt = 0;
   for (int m = 0; m < curDim; ++m) {
-    cThisRank[m] = 0;
+    c[m] = 0; // accumulate straight into c; the separate per-rank buffer is gone
     for (int n = 0; n < ctmpNblocks; ++n) {
-      cThisRank[m] += ctmp[cnt];
-      ++cnt;
+      c[m] += ctmp[m*ctmpNblocks + n]; // same element the old running counter visited: ctmpNblocks consecutive partials per m
     }
   }
 
-  MPI_Allreduce(cThisRank, c, curDim, MPI_DFLOAT, MPI_SUM, comm);
-  o_c.copyFrom(c, curDim*sizeof(dfloat));
-
-  return;
+  comm.Allreduce(c, Comm::Sum, curDim); // sum the first curDim entries across comm (presumably in place - confirm against comm_t)
+  c.copyTo(o_c, curDim);                // mirror the reduced coefficients into o_c
 }
 
-void igProjectionStrategy::igReconstruct(occa::memory& o_u, dfloat a, occa::memory& o_c, occa::memory& o_Q, occa::memory& o_unew)
+void Projection::igReconstruct(deviceMemory<dfloat>& o_u, dfloat a, deviceMemory<dfloat>& o_c, deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_unew) // thin wrapper: forwards its arguments to igReconstructKernel with Ntotal and curDim prepended
 {
   igReconstructKernel(Ntotal, curDim, o_u, a, o_c, o_Q, o_unew);
-  return;
 }
 
 
 /*****************************************************************************/
 
-igClassicProjectionStrategy::igClassicProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
-  igProjectionStrategy(_N, _platform, _settings, _comm)
-{
-  return;
-}
+ClassicProjection::ClassicProjection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm):
+  Projection(_N, _platform, _settings, _comm) // all setup is done by the Projection constructor
+{} // nothing extra to initialise for this variant
 
-void igClassicProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs)
+void ClassicProjection::Update(operator_t &linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs)
 {
   // Compute RHS corresponding to the approximate solution obtained.
-  solver.Operator(o_x, o_btilde);
+  linearOperator.Operator(o_x, o_btilde);
 
   // Insert new solution into the initial guess space.
   if ((curDim >= maxDim) || (curDim == 0)) {
     dfloat normbtilde = 0.0;
 
-    normbtilde = platform.linAlg.norm2(Ntotal,  o_btilde, comm);
+    normbtilde = platform.linAlg().norm2(Ntotal, o_btilde, comm);
 
     if (normbtilde > 0) {
-      igScaleKernel(Ntotal, 1.0/normbtilde, o_btilde, o_Btilde);
-      igScaleKernel(Ntotal, 1.0/normbtilde, o_x,      o_Xtilde);
+      igScaleKernel(Ntotal, dfloat(1.0)/normbtilde, o_btilde, o_Btilde);
+      igScaleKernel(Ntotal, dfloat(1.0)/normbtilde, o_x,      o_Xtilde);
 
       curDim = 1;
     }
@@ -267,17 +173,17 @@ void igClassicProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, oc
     dfloat    invnormbtilde = 0.0;
     const int Nreorth = 2;
 
-    o_x.copyTo(o_xtilde, Ntotal*sizeof(dfloat));
+    o_x.copyTo(o_xtilde, Ntotal);
 
     // Orthogonalize new RHS against previous ones.
     for (int n = 0; n < Nreorth; n++) {
-      igBasisInnerProducts(o_btilde, o_Btilde, o_alphas, alphas, alphasThisRank);
-      igReconstruct(o_btilde, (dfloat)(-1.0), o_alphas, o_Btilde, o_btilde);
-      igReconstruct(o_xtilde, (dfloat)(-1.0), o_alphas, o_Xtilde, o_xtilde);
+      igBasisInnerProducts(o_btilde, o_Btilde, o_alphas, alphas);
+      igReconstruct(o_btilde, -1.0, o_alphas, o_Btilde, o_btilde);
+      igReconstruct(o_xtilde, -1.0, o_alphas, o_Xtilde, o_xtilde);
     }
 
     // Normalize.
-    invnormbtilde = platform.linAlg.norm2(Ntotal, o_btilde, comm);
+    invnormbtilde = platform.linAlg().norm2(Ntotal, o_btilde, comm);
     invnormbtilde = 1.0/invnormbtilde;
 
 #if 0
@@ -293,38 +199,26 @@ void igClassicProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, oc
 
     curDim++;
   }
-
-  return;
 }
 
 /*****************************************************************************/
 
-igRollingQRProjectionStrategy::igRollingQRProjectionStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
-  igProjectionStrategy(_N, _platform, _settings, _comm)
+RollingQRProjection::RollingQRProjection(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm):
+  Projection(_N, _platform, _settings, _comm) // base-class construction comes first; maxDim is used below
 {
-  R = new dfloat[maxDim*maxDim]();
-  o_R = platform.malloc(maxDim*maxDim*sizeof(dfloat));
+  R = platform.hostMalloc<dfloat>(maxDim*maxDim); // host copy of R, maxDim*maxDim entries. NOTE(review): the replaced `new dfloat[...]()` zero-initialised this buffer; confirm hostMalloc does too
+  o_R = platform.malloc<dfloat>(maxDim*maxDim); // device buffer of the same size; Update() fills it with R.copyTo(o_R)
 
-  occa::properties kernelInfo = platform.props;
+  properties_t kernelInfo = platform.props(); // start from the platform properties, then add the define below
   kernelInfo["defines/" "p_igNhist"] = maxDim;
 
   igDropQRFirstColumnKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/igDropQRFirstColumn.okl", "igDropQRFirstColumn", kernelInfo);
-
-  return;
 }
 
-igRollingQRProjectionStrategy::~igRollingQRProjectionStrategy()
-{
-  if (R)
-    delete[] R;
-
-  return;
-}
-
-void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs)
+void RollingQRProjection::Update(operator_t &linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs)
 {
   // Compute RHS corresponding to the approximate solution obtained.
-  solver.Operator(o_x, o_btilde);
+  linearOperator.Operator(o_x, o_btilde);
 
   // Rotate the history space (QR update).
   if (curDim == maxDim) {
@@ -335,7 +229,7 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x,
       R[j*maxDim + (maxDim - 1)] = 0.0;
     }
 
-    o_R.copyFrom(R);
+    R.copyTo(o_R);
 
     // Update the RHS and solution spaces.
 		igDropQRFirstColumnKernel(Ntotal, o_Btilde, o_Xtilde, o_R);
@@ -346,7 +240,7 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x,
       dfloat Rjj   = R[j*maxDim + j];
       dfloat Rjp1j = R[(j + 1)*maxDim + j];
 
-      givensRotation(Rjj, Rjp1j, &c, &s);
+      givensRotation(Rjj, Rjp1j, c, s);
 
       for (int i = j; i < maxDim; i++) {
         dfloat Rji   = R[j*maxDim + i];
@@ -358,8 +252,8 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x,
     }
 
     // Copy the updated R back to the device.
-    platform.device.finish();
-    o_R.copyFrom(R);
+    platform.finish();
+    R.copyTo(o_R);
 
     curDim--;
   }
@@ -368,7 +262,7 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x,
   if (curDim == 0) {
     dfloat normbtilde = 0.0;
 
-    normbtilde = platform.linAlg.norm2(Ntotal, o_btilde, comm);
+    normbtilde = platform.linAlg().norm2(Ntotal, o_btilde, comm);
 
     if (normbtilde > 0) {
 #if 0
@@ -387,10 +281,10 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x,
     dfloat    normbtilde = 0.0, normbtildeproj = 0.0;;
     const int Nreorth = 2;
 
-    o_x.copyTo(o_xtilde, Ntotal*sizeof(dfloat));
+    o_x.copyTo(o_xtilde, Ntotal);
 
     // Compute the initial norm of the new vector.
-    normbtilde = platform.linAlg.norm2(Ntotal, o_btilde, comm);
+    normbtilde = platform.linAlg().norm2(Ntotal, o_btilde, comm);
 
     // Zero the entries above/on the diagonal of the column of R into which we want to write.
     for (int i = 0; i < curDim; i++)
@@ -398,7 +292,7 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x,
 
     // Orthogonalize new RHS against previous ones.
     for (int n = 0; n < Nreorth; n++) {
-      igBasisInnerProducts(o_btilde, o_Btilde, o_alphas, alphas, alphasThisRank);
+      igBasisInnerProducts(o_btilde, o_Btilde, o_alphas, alphas);
       igReconstruct(o_btilde, (dfloat)(-1.0), o_alphas, o_Btilde, o_btilde);
       igReconstruct(o_xtilde, (dfloat)(-1.0), o_alphas, o_Xtilde, o_xtilde);
 
@@ -407,7 +301,7 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x,
     }
 
     // Normalize.
-    normbtildeproj = platform.linAlg.norm2(Ntotal, o_btilde, comm);
+    normbtildeproj = platform.linAlg().norm2(Ntotal, o_btilde, comm);
 
     // Only add if the remainder after projection is large enough.
     //
@@ -431,63 +325,57 @@ void igRollingQRProjectionStrategy::Update(solver_t &solver, occa::memory& o_x,
     }
   }
 
-  o_R.copyFrom(R);
+  R.copyTo(o_R);
 }
 
-void igRollingQRProjectionStrategy::givensRotation(dfloat a, dfloat b, dfloat *c, dfloat *s)
+void RollingQRProjection::givensRotation(dfloat a, dfloat b, dfloat& c, dfloat& s) // inputs a, b; the rotation coefficients are written to c and s (by reference)
 {
 	// Compute a Givens rotation that zeros the bottom component of [a ; b].
   if (b != 0) {
-    dfloat h = hypot(a, b);
+    dfloat h = std::hypot(a, b); // std:: overload set, consistent with std::abs / std::copysign below
     dfloat d = 1.0/h;
-    *c = fabs(a)*d;
-    *s = copysign(d, a)*b;
+    c = std::abs(a)*d;          // c = |a| / h
+    s = std::copysign(d, a)*b;  // s = sign(a) * b / h
   } else {
-    *c = 1.0;
-    *s = 0.0;
+    c = 1.0; // b is already zero: identity rotation
+    s = 0.0;
   }
-
-  return;
 }
 
 /*****************************************************************************/
 
-igExtrapStrategy::igExtrapStrategy(dlong _N, platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
+Extrap::Extrap(dlong _N, platform_t& _platform, settings_t& _settings, comm_t _comm): // reads the history size and extrapolation degree from settings, allocates the history buffers and builds the kernels
   initialGuessStrategy_t(_N, _platform, _settings, _comm)
 {
   int M, m;
   settings.getSetting("INITIAL GUESS HISTORY SPACE DIMENSION", M);
   settings.getSetting("INITIAL GUESS EXTRAP DEGREE", m);
 
-  dfloat *c = new dfloat[M]();
+  memory<dfloat> c(M, 0.0); // zero-filled, as the replaced `new dfloat[M]()` was: extrapCoeffs only writes c for the MINNORM/CPQR methods, and c is uploaded below
   extrapCoeffs(m, M, c);
 
   Nhistory = M;
 
   entry = 0;
 
-  o_coeffs = platform.malloc(Nhistory*sizeof(dfloat), c);
+  o_coeffs = platform.malloc<dfloat>(Nhistory, c); // device copy of the Nhistory coefficients in c
 
   shift = 0;
 
-  o_xh = platform.malloc(Nhistory*Ntotal*sizeof(dfloat));
+  o_xh = platform.malloc<dfloat>(Nhistory*Ntotal); // history storage: Nhistory slots of Ntotal entries (Update() writes slot `shift`)
 
-  platform.linAlg.InitKernels({"set"});
+  platform.linAlg().InitKernels({"set"}); // "set" is the linAlg routine used on o_xh at the end of this constructor
 
-  occa::properties kernelInfo = platform.props;
+  properties_t kernelInfo = platform.props(); // start from the platform properties, then add the define below
   kernelInfo["defines/" "p_igNhist"] = Nhistory;
 
   igExtrapKernel       = platform.buildKernel(LINEARSOLVER_DIR "/okl/igExtrap.okl",       "igExtrap",   kernelInfo);
   igExtrapSparseKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/igExtrap.okl", "igExtrapSparse",   kernelInfo);
 
-  platform.linAlg.set(Nhistory*Ntotal, 0.0, o_xh);
-
-  delete[] c;
-
-  return;
+  platform.linAlg().set(Nhistory*Ntotal, 0.0, o_xh); // initialise the whole history buffer with 0.0
 }
 
-void igExtrapStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs)
+void Extrap::FormInitialGuess(deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs)
 {
   if (entry < Nhistory) {
     int M, m;
@@ -495,16 +383,14 @@ void igExtrapStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs)
       settings.getSetting("INITIAL GUESS HISTORY SPACE DIMENSION", M);
       settings.getSetting("INITIAL GUESS EXTRAP DEGREE", m);
     } else {
-      M = mymax(1, entry + 1);
-      m = sqrt((double)M);
+      M = std::max(1, entry + 1);
+      m = sqrt(static_cast<double>(M));
     }
 
     // Construct the extrapolation coefficients.
-    dfloat *c, *d, *sparseCoeffs;
-
-    c = new dfloat[Nhistory]();
-    d = new dfloat[Nhistory]();
-    sparseCoeffs = new dfloat[Nhistory]();
+    memory<dfloat> c(Nhistory);
+    memory<dfloat> d(Nhistory);
+    memory<dfloat> sparseCoeffs(Nhistory);
     for (int n = 0; n < Nhistory; ++n) {
       c[n] = 0;
       d[n] = 0;
@@ -512,7 +398,7 @@ void igExtrapStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs)
     }
 
     if (M == 1) {
-        d[Nhistory - 1] = 1.0;
+      d[Nhistory - 1] = 1.0;
     } else {
       extrapCoeffs(m, M, c);
 
@@ -521,23 +407,21 @@ void igExtrapStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs)
         d[Nhistory - M + i] = c[i];
     }
 
-    int *sparseIds = new int[Nhistory]();
+    memory<int> sparseIds(Nhistory);
     Nsparse = 0;
     for (int n = 0; n < Nhistory; ++n) {
-      if (fabs(d[n]) > 1e-14) { // hmm
+      if (std::abs(d[n]) > 1e-14) { // hmm
         sparseIds[Nsparse] = n;
         sparseCoeffs[Nsparse] = d[n];
         ++Nsparse;
       }
     }
 
-    o_coeffs = platform.malloc(Nhistory*sizeof(dfloat), d);
-    o_sparseIds = platform.malloc(Nhistory*sizeof(int), sparseIds);
-    o_sparseCoeffs = platform.malloc(Nhistory*sizeof(dfloat), sparseCoeffs);
+    o_coeffs = platform.malloc<dfloat>(d);
+    o_sparseIds = platform.malloc<int>(sparseIds);
+    o_sparseCoeffs = platform.malloc<dfloat>(sparseCoeffs);
 
     ++entry;
-
-    delete[] sparseIds;
   }
 
   if (settings.compareSetting("INITIAL GUESS EXTRAP COEFFS METHOD", "MINNORM"))
@@ -545,50 +429,41 @@ void igExtrapStrategy::FormInitialGuess(occa::memory& o_x, occa::memory& o_rhs)
   else {
     igExtrapSparseKernel(Ntotal, Nhistory, shift, Nsparse, o_sparseIds, o_sparseCoeffs, o_xh, o_x);
   }
-
-  return;
 }
 
-void igExtrapStrategy::Update(solver_t &solver, occa::memory& o_x, occa::memory& o_rhs)
+void Extrap::Update(operator_t &linearOperator, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs) // records o_x in the history; linearOperator and o_rhs are not used here
 {
-  occa::memory o_tmp = o_xh + Ntotal*shift*sizeof(dfloat);
-  o_x.copyTo(o_tmp, Ntotal*sizeof(dfloat));
-  shift = (shift + 1) % Nhistory;
-
-  return;
+  deviceMemory<dfloat> o_slot = o_xh + Ntotal*shift; // view of o_xh starting at entry Ntotal*shift
+  o_x.copyTo(o_slot, Ntotal);                        // copy Ntotal entries of o_x into that slot
+  shift = (shift + 1) % Nhistory;                    // advance the slot index, wrapping after Nhistory slots
 }
 
-void igExtrapStrategy::extrapCoeffs(int m, int M, dfloat *c)
+void Extrap::extrapCoeffs(int m, int M, memory<dfloat> c) // fills c via an underdetermined solve built from degree-m Vandermonde data at M nodes; c is passed by value but written through (presumably shares storage with the caller's buffer - confirm)
 {
-  dfloat h, ro, *r, *V, *b;
-
-  if (M < m + 1) {
-    std::stringstream ss;
-    ss << "Extrapolation space dimension (" << M << ") too low for degree (" << m << ").";
-    LIBP_ABORT(ss.str());
-  }
+  LIBP_ABORT("Extrapolation space dimension (" << M << ") too low for degree (" << m << ").",
+             M < m + 1); // second argument is the abort condition: fewer than m+1 history entries
 
-  h = 2.0/(M - 1);
-  r = new dfloat[M]();
+  const dfloat h = 2.0/(M - 1); // node spacing. NOTE(review): divides by zero when M == 1; FormInitialGuess special-cases M == 1 before calling
+  memory<dfloat> r(M); // M equispaced nodes from -1 to 1, filled by the loop below
   for (int i = 0; i < M; i++)
     r[i] = -1.0 + i*h;
-  ro = 1.0 + h;  // Evaluation point.
 
-  V = new dfloat[(m + 1)*M]();
-  mesh_t::Vandermonde1D(m, M, r, V);
+  memory<dfloat> ro(1);
+  ro[0] = 1.0 + h;  // Evaluation point: one spacing past the last node.
 
-  b = new dfloat[m + 1]();
-  mesh_t::Vandermonde1D(m, 1, &ro, b);
+  memory<dfloat> V; // left unsized here; presumably allocated by Vandermonde1D - confirm
+  mesh_t::Vandermonde1D(m, r, V); // Vandermonde data of degree m at the nodes r
+
+  memory<dfloat> b; // as for V, sized by the callee (assumption)
+  mesh_t::Vandermonde1D(m, ro, b); // same construction at the single evaluation point
 
   if (settings.compareSetting("INITIAL GUESS EXTRAP COEFFS METHOD", "MINNORM")) {
-    matrixUnderdeterminedRightSolveMinNorm(M, m + 1, V, b, c);
+    linAlg_t::matrixUnderdeterminedRightSolveMinNorm(M, m + 1, V, b, c); // result is returned in c
   } else if (settings.compareSetting("INITIAL GUESS EXTRAP COEFFS METHOD", "CPQR")) {
-    matrixUnderdeterminedRightSolveCPQR(M, m + 1, V, b, c);
+    linAlg_t::matrixUnderdeterminedRightSolveCPQR(M, m + 1, V, b, c); // result is returned in c
   }
+} // NOTE(review): c is left unwritten when the method setting is neither MINNORM nor CPQR
 
-  delete[] r;
-  delete[] V;
-  delete[] b;
+} //namespace InitialGuess
 
-  return;
-}
+} //namespace libp
diff --git a/libs/linearSolver/linearSolver.cpp b/libs/linearSolver/linearSolver.cpp
index ac60163de..50289ccad 100644
--- a/libs/linearSolver/linearSolver.cpp
+++ b/libs/linearSolver/linearSolver.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,25 +26,33 @@ SOFTWARE.
 
 #include "linearSolver.hpp"
 
-//virtual base linear solver class
-linearSolver_t* linearSolver_t::Setup(dlong N, dlong Nhalo,
-                               platform_t& platform, settings_t& settings, MPI_Comm comm) {
-
-  linearSolver_t *linearSolver=NULL;
-
-  if (settings.compareSetting("LINEAR SOLVER","NBPCG")){
-    linearSolver = new nbpcg(N, Nhalo, platform, settings, comm);
-  } else if (settings.compareSetting("LINEAR SOLVER","NBFPCG")){
-    linearSolver = new nbfpcg(N, Nhalo, platform, settings, comm);
-  } else if (settings.compareSetting("LINEAR SOLVER","PCG")){
-    linearSolver = new pcg(N, Nhalo, platform, settings, comm);
-  } else if (settings.compareSetting("LINEAR SOLVER","PGMRES")){
-    linearSolver = new pgmres(N, Nhalo, platform, settings, comm);
-  } else if (settings.compareSetting("LINEAR SOLVER","PMINRES")){
-    linearSolver = new pminres(N, Nhalo, platform, settings, comm);
-  } else {
-    LIBP_ABORT(string("Requested LINEAR SOLVER not found."));
-  }
-
-  return linearSolver;
+namespace libp {
+
+// Solve for o_x: seed o_x through the initial-guess strategy, run the
+// configured solver, then hand the result back to the strategy via Update().
+// Returns whatever the underlying solver's Solve() returns.
+int linearSolver_t::Solve(operator_t& linearOperator, operator_t& precon,
+                          deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_rhs,
+                          const dfloat tol, const int MAXIT, const int verbose) {
+  // Aborts unless both the solver (ls) and the initial guess (ig) are set.
+  assertInitialized();
+
+  ig->FormInitialGuess(o_x, o_rhs);
+  const int iterations = ls->Solve(linearOperator, precon, o_x, o_rhs, tol, MAXIT, verbose);
+  ig->Update(linearOperator, o_x, o_rhs);
+  return iterations;
+}
+
+void linearSolver_t::MakeDefaultInitialGuessStrategy() { // installs InitialGuess::Default built from the solver's N, platform, settings and comm
+  LIBP_ABORT("LinearSolver not initialized", ls==nullptr); // ls is dereferenced on the next line
+  ig = std::make_shared<InitialGuess::Default>(ls->N, ls->platform, ls->settings, ls->comm);
+}
+
+// Abort with a descriptive message if either the solver (ls) or the
+// initial-guess strategy (ig) has not been set up yet.
+void linearSolver_t::assertInitialized() {
+  LIBP_ABORT("LinearSolver not initialized", ls==nullptr);
+  LIBP_ABORT("InitialGuess not initialized", ig==nullptr);
+}
+
+} //namespace libp
diff --git a/libs/linearSolver/linearSolverNBFPCG.cpp b/libs/linearSolver/linearSolverNBFPCG.cpp
index 3bc8c63a8..20ae83ba8 100644
--- a/libs/linearSolver/linearSolverNBFPCG.cpp
+++ b/libs/linearSolver/linearSolverNBFPCG.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,36 +26,37 @@ SOFTWARE.
 
 #include "linearSolver.hpp"
 
+namespace libp {
+
+namespace LinearSolver {
+
 #define NBFPCG_BLOCKSIZE 512
 
 nbfpcg::nbfpcg(dlong _N, dlong _Nhalo,
-         platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
-  linearSolver_t(_N, _Nhalo, _platform, _settings, _comm) {
+         platform_t& _platform, settings_t& _settings, comm_t _comm):
+  linearSolverBase_t(_N, _Nhalo, _platform, _settings, _comm) {
+
+  platform.linAlg().InitKernels({"axpy", "zaxpy"});
 
   dlong Ntotal = N + Nhalo;
 
   /*aux variables */
-  o_u  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_p  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_w  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_n  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_m  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_s  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_z  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_q  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_Ax = platform.malloc(Ntotal*sizeof(dfloat));
-
-  localdots  = (dfloat*) calloc(4, sizeof(dfloat));
-  globaldots = (dfloat*) calloc(4, sizeof(dfloat));
+  o_u  = platform.malloc<dfloat>(Ntotal);
+  o_p  = platform.malloc<dfloat>(Ntotal);
+  o_w  = platform.malloc<dfloat>(Ntotal);
+  o_n  = platform.malloc<dfloat>(Ntotal);
+  o_m  = platform.malloc<dfloat>(Ntotal);
+  o_s  = platform.malloc<dfloat>(Ntotal);
+  o_z  = platform.malloc<dfloat>(Ntotal);
+  o_q  = platform.malloc<dfloat>(Ntotal);
+  o_Ax = platform.malloc<dfloat>(Ntotal);
 
   //pinned tmp buffer for reductions
-  tmpdots = (dfloat*) platform.hostMalloc(4*NBFPCG_BLOCKSIZE*sizeof(dfloat),
-                                          NULL, h_tmpdots);
-
-  o_tmpdots = platform.malloc(4*NBFPCG_BLOCKSIZE*sizeof(dfloat));
+  dots = platform.hostMalloc<dfloat>(4*NBFPCG_BLOCKSIZE);
+  o_dots = platform.malloc<dfloat>(4*NBFPCG_BLOCKSIZE);
 
   /* build kernels */
-  occa::properties kernelInfo = platform.props; //copy base properties
+  properties_t kernelInfo = platform.props(); //copy base properties
 
   //add defines
   kernelInfo["defines/" "p_blockSize"] = (int)NBFPCG_BLOCKSIZE;
@@ -67,13 +68,12 @@ nbfpcg::nbfpcg(dlong _N, dlong _Nhalo,
                                 "update1NBFPCG", kernelInfo);
 }
 
-int nbfpcg::Solve(solver_t& solver, precon_t& precon,
-                  occa::memory &o_x, occa::memory &o_r,
+int nbfpcg::Solve(operator_t& linearOperator, operator_t& precon,
+                  deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_r,
                   const dfloat tol, const int MAXIT, const int verbose) {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-  linAlg_t &linAlg = platform.linAlg;
+  int rank = comm.rank();
+  linAlg_t &linAlg = platform.linAlg();
 
   // register scalars
   dfloat alpha0 = 0;
@@ -84,7 +84,7 @@ int nbfpcg::Solve(solver_t& solver, precon_t& precon,
   dfloat rdotr0 = 0;
 
   // compute A*x
-  solver.Operator(o_x, o_Ax);
+  linearOperator.Operator(o_x, o_Ax);
 
   // subtract r = r - A*x
   linAlg.axpy(N, -1.f, o_Ax, 1.f, o_r);
@@ -96,7 +96,7 @@ int nbfpcg::Solve(solver_t& solver, precon_t& precon,
   o_p.copyFrom(o_u);
 
   // w = A*p
-  solver.Operator(o_p, o_w);
+  linearOperator.Operator(o_p, o_w);
 
   // gamma = u.r
   // delta = u.w
@@ -104,19 +104,19 @@ int nbfpcg::Solve(solver_t& solver, precon_t& precon,
 
   precon.Operator(o_w, o_m);
 
-  solver.Operator(o_m, o_n);
+  linearOperator.Operator(o_m, o_n);
   o_s.copyFrom(o_w);
   o_q.copyFrom(o_m);
   o_z.copyFrom(o_n);
 
-  MPI_Wait(&request, &status);
-  gamma0 = globaldots[0]; // udotr
-  delta0 = globaldots[1]; // udotw
-  rdotr0 = globaldots[2]; // rdotr
+  comm.Wait(request);
+  gamma0 = dots[0]; // udotr
+  delta0 = dots[1]; // udotw
+  rdotr0 = dots[2]; // rdotr
   eta0   = delta0;
   alpha0 = gamma0/eta0;
 
-  dfloat TOL = mymax(tol*tol*rdotr0,tol*tol);
+  dfloat TOL = std::max(tol*tol*rdotr0,tol*tol);
 
   if (verbose&&(rank==0))
     printf("NBFPCG: initial res norm %12.12f \n", sqrt(rdotr0));
@@ -147,14 +147,14 @@ int nbfpcg::Solve(solver_t& solver, precon_t& precon,
     linAlg.axpy(N, 1.0, o_u, 1.0, o_m);
 
     // n = A*m
-    solver.Operator(o_m, o_n);
+    linearOperator.Operator(o_m, o_n);
 
     // block for delta
-    MPI_Wait(&request, &status);
-    gamma0 = globaldots[0];       //  u.r
-    beta0  = -globaldots[1]/eta0; // -u.s/eta
-    delta0 = globaldots[2];       //  u.w
-    rdotr0 = globaldots[3];       // r.r
+    comm.Wait(request);
+    gamma0 = dots[0];       //  u.r
+    beta0  = -dots[1]/eta0; // -u.s/eta
+    delta0 = dots[2];       //  u.w
+    rdotr0 = dots[3];       // r.r
 
     //  p <= u + beta*p
     linAlg.axpy(N, 1.0, o_u, beta0, o_p);
@@ -187,64 +187,60 @@ int nbfpcg::Solve(solver_t& solver, precon_t& precon,
   return iter;
 }
 
-void nbfpcg::Update0NBFPCG(occa::memory &o_r){
+void nbfpcg::Update0NBFPCG(deviceMemory<dfloat>& o_r){ // runs update0NBFPCGKernel, sums its per-block partial results on the host, and starts an Iallreduce of the three totals
 
   // (u.r)
   // (u.w)
   // (r.r)
   int Nblocks = (N+NBFPCG_BLOCKSIZE-1)/NBFPCG_BLOCKSIZE;
-  Nblocks = (Nblocks>NBFPCG_BLOCKSIZE) ? NBFPCG_BLOCKSIZE : Nblocks; //limit to NBFPCG_BLOCKSIZE entries
-
-  update0NBFPCGKernel(N, Nblocks, o_u, o_r, o_w, o_tmpdots);
+  Nblocks = std::min(Nblocks, NBFPCG_BLOCKSIZE); //limit to NBFPCG_BLOCKSIZE entries
 
-  o_tmpdots.copyTo(tmpdots, 3*Nblocks*sizeof(dfloat));
+  update0NBFPCGKernel(N, Nblocks, o_u, o_r, o_w, o_dots); // presumably writes three partial sums per block into o_dots (3*Nblocks values are copied back below)
 
-  localdots[0] = 0;
-  localdots[1] = 0;
-  localdots[2] = 0;
-  for(int n=0;n<Nblocks;++n) {
-    localdots[0] += tmpdots[0+3*n];
-    localdots[1] += tmpdots[1+3*n];
-    localdots[2] += tmpdots[2+3*n];
+  if (Nblocks>0) { // at least one block: fetch the partial sums into the host buffer
+    dots.copyFrom(o_dots, 3*Nblocks);
+  } else { // no blocks: nothing is copied, so zero the three totals explicitly
+    dots[0] = 0.0;
+    dots[1] = 0.0;
+    dots[2] = 0.0;
   }
 
-  globaldots[0] = 0;
-  globaldots[1] = 0;
-  globaldots[2] = 0;
-  MPI_Iallreduce(localdots, globaldots, 3, MPI_DFLOAT, MPI_SUM, comm, &request);
+  for(int n=1;n<Nblocks;++n) { // fold blocks 1..Nblocks-1 into dots[0..2] in place; block 0 already sits there
+    dots[0] += dots[0+3*n];
+    dots[1] += dots[1+3*n];
+    dots[2] += dots[2+3*n];
+  }
+  comm.Iallreduce(dots, Comm::Sum, 3, request); // in-place sum over comm; Solve() calls comm.Wait(request) and then reads dots[0..2] as u.r, u.w, r.r
 }
 
-void nbfpcg::Update1NBFPCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r){
+void nbfpcg::Update1NBFPCG(const dfloat alpha, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_r){ // runs update1NBFPCGKernel, sums its per-block partial results on the host, and starts an Iallreduce of the four totals
 
   // p <= z + beta*p
   // s <= Z + beta*s
   // dot(p,s)
   int Nblocks = (N+NBFPCG_BLOCKSIZE-1)/NBFPCG_BLOCKSIZE;
-  Nblocks = (Nblocks>NBFPCG_BLOCKSIZE) ? NBFPCG_BLOCKSIZE : Nblocks; //limit to NBFPCG_BLOCKSIZE entries
-
-  update1NBFPCGKernel(N, Nblocks, o_p, o_s, o_q, o_z, alpha, o_x, o_r, o_u, o_w, o_tmpdots);
+  Nblocks = std::min(Nblocks,NBFPCG_BLOCKSIZE); //limit to NBFPCG_BLOCKSIZE entries
 
-  o_tmpdots.copyTo(tmpdots, 4*Nblocks*sizeof(dfloat));
+  update1NBFPCGKernel(N, Nblocks, o_p, o_s, o_q, o_z, alpha, o_x, o_r, o_u, o_w, o_dots); // presumably writes four partial sums per block into o_dots (4*Nblocks values are copied back below)
 
-  localdots[0] = 0;
-  localdots[1] = 0;
-  localdots[2] = 0;
-  localdots[3] = 0;
-  for(int n=0;n<Nblocks;++n) {
-    localdots[0] += tmpdots[0+4*n];
-    localdots[1] += tmpdots[1+4*n];
-    localdots[2] += tmpdots[2+4*n];
-    localdots[3] += tmpdots[3+4*n];
+  if (Nblocks>0) { // at least one block: fetch the partial sums into the host buffer
+    dots.copyFrom(o_dots, 4*Nblocks);
+  } else { // no blocks: nothing is copied, so zero the four totals explicitly
+    dots[0] = 0.0;
+    dots[1] = 0.0;
+    dots[2] = 0.0;
+    dots[3] = 0.0;
   }
 
-  globaldots[0] = 0;
-  globaldots[1] = 0;
-  globaldots[2] = 0;
-  globaldots[3] = 0;
-  MPI_Iallreduce(localdots, globaldots, 4, MPI_DFLOAT, MPI_SUM, comm, &request);
+  for(int n=1;n<Nblocks;++n) { // fold blocks 1..Nblocks-1 into dots[0..3] in place; block 0 already sits there
+    dots[0] += dots[0+4*n];
+    dots[1] += dots[1+4*n];
+    dots[2] += dots[2+4*n];
+    dots[3] += dots[3+4*n];
+  }
+  comm.Iallreduce(dots, Comm::Sum, 4, request); // in-place sum over comm; Solve() reads dots[0..3] as u.r, u.s, u.w, r.r after comm.Wait(request). NOTE(review): the p/s comment at the top of this function does not match those labels
 }
 
-nbfpcg::~nbfpcg() {
-  update0NBFPCGKernel.free();
-  update1NBFPCGKernel.free();
-}
\ No newline at end of file
+} //namespace LinearSolver
+
+} //namespace libp
diff --git a/libs/linearSolver/linearSolverNBPCG.cpp b/libs/linearSolver/linearSolverNBPCG.cpp
index 63858cfa4..2d820babd 100644
--- a/libs/linearSolver/linearSolverNBPCG.cpp
+++ b/libs/linearSolver/linearSolverNBPCG.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,33 +26,36 @@ SOFTWARE.
 
 #include "linearSolver.hpp"
 
+namespace libp {
+
+namespace LinearSolver {
+
 #define NBPCG_BLOCKSIZE 512
 
 nbpcg::nbpcg(dlong _N, dlong _Nhalo,
-         platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
-  linearSolver_t(_N, _Nhalo, _platform, _settings, _comm) {
+         platform_t& _platform, settings_t& _settings, comm_t _comm):
+  linearSolverBase_t(_N, _Nhalo, _platform, _settings, _comm) {
+
+  platform.linAlg().InitKernels({"axpy"});
 
   dlong Ntotal = N + Nhalo;
 
-  /*aux variables */
-  o_p  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_s  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_S  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_z  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_Z  = platform.malloc(Ntotal*sizeof(dfloat));
-  o_Ax = platform.malloc(Ntotal*sizeof(dfloat));
+  memory<dfloat> dummy(Ntotal, 0.0);
 
-  localdots  = (dfloat*) calloc(3, sizeof(dfloat));
-  globaldots = (dfloat*) calloc(3, sizeof(dfloat));
+  /*aux variables */
+  o_p  = platform.malloc<dfloat>(Ntotal, dummy);
+  o_s  = platform.malloc<dfloat>(Ntotal, dummy);
+  o_S  = platform.malloc<dfloat>(Ntotal, dummy);
+  o_z  = platform.malloc<dfloat>(Ntotal);
+  o_Z  = platform.malloc<dfloat>(Ntotal);
+  o_Ax = platform.malloc<dfloat>(Ntotal);
 
   //pinned tmp buffer for reductions
-  tmpdots = (dfloat*) platform.hostMalloc(3*NBPCG_BLOCKSIZE*sizeof(dfloat),
-                                          NULL, h_tmpdots);
-
-  o_tmpdots = platform.malloc(3*NBPCG_BLOCKSIZE*sizeof(dfloat));
+  dots = platform.hostMalloc<dfloat>(3*NBPCG_BLOCKSIZE);
+  o_dots = platform.malloc<dfloat>(3*NBPCG_BLOCKSIZE);
 
   /* build kernels */
-  occa::properties kernelInfo = platform.props; //copy base properties
+  properties_t kernelInfo = platform.props(); //copy base properties
 
   //add defines
   kernelInfo["defines/" "p_blockSize"] = (int)NBPCG_BLOCKSIZE;
@@ -64,13 +67,12 @@ nbpcg::nbpcg(dlong _N, dlong _Nhalo,
                                 "update2NBPCG", kernelInfo);
 }
 
-int nbpcg::Solve(solver_t& solver, precon_t& precon,
-                 occa::memory &o_x, occa::memory &o_r,
+int nbpcg::Solve(operator_t& linearOperator, operator_t& precon,
+                 deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_r,
                  const dfloat tol, const int MAXIT, const int verbose) {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-  linAlg_t &linAlg = platform.linAlg;
+  int rank = comm.rank();
+  linAlg_t &linAlg = platform.linAlg();
 
   // register scalars
   dfloat zdotz0 = 0;
@@ -84,7 +86,7 @@ int nbpcg::Solve(solver_t& solver, precon_t& precon,
   dfloat gamma1 = 0; // history gamma
 
   // compute A*x
-  solver.Operator(o_x, o_Ax);
+  linearOperator.Operator(o_x, o_Ax);
 
   // subtract r = r - A*x
   linAlg.axpy(N, -1.f, o_Ax, 1.f, o_r);
@@ -97,14 +99,14 @@ int nbpcg::Solve(solver_t& solver, precon_t& precon,
   alpha0 = 0;
   Update2NBPCG(alpha0, o_r);
 
-  solver.Operator(o_z, o_Z);
+  linearOperator.Operator(o_z, o_Z);
 
-  MPI_Wait(&request, &status);
-  gamma0 = globaldots[0]; // rdotz
-  zdotz0 = globaldots[1];
-  rdotr0 = globaldots[2];
+  comm.Wait(request);
+  gamma0 = dots[0]; // rdotz
+  zdotz0 = dots[1];
+  rdotr0 = dots[2];
 
-  dfloat TOL = mymax(tol*tol*rdotr0,tol*tol);
+  dfloat TOL = std::max(tol*tol*rdotr0,tol*tol);
 
   if (verbose&&(rank==0))
     printf("NBPCG: initial res norm %12.12f \n", sqrt(rdotr0));
@@ -125,8 +127,8 @@ int nbpcg::Solve(solver_t& solver, precon_t& precon,
     precon.Operator(o_s, o_S);
 
     // block for delta
-    MPI_Wait(&request, &status);
-    delta0 = globaldots[0];
+    comm.Wait(request);
+    delta0 = dots[0];
 
     // alpha = gamma/delta
     alpha0 = gamma0/delta0;
@@ -142,14 +144,14 @@ int nbpcg::Solve(solver_t& solver, precon_t& precon,
     linAlg.axpy(N, alpha0, o_p, 1.0, o_x);
 
     // Z = A*z
-    solver.Operator(o_z, o_Z);
+    linearOperator.Operator(o_z, o_Z);
 
     // block for delta
-    MPI_Wait(&request, &status);
+    comm.Wait(request);
     gamma1 = gamma0;
-    gamma0 = globaldots[0]; // gamma = r.z
-    zdotz0 = globaldots[1]; //
-    rdotr0 = globaldots[2]; //
+    gamma0 = dots[0]; // gamma = r.z
+    zdotz0 = dots[1]; //
+    rdotr0 = dots[2]; //
 
     beta0 = gamma0/gamma1;
 
@@ -171,21 +173,23 @@ void nbpcg::Update1NBPCG(const dfloat beta){
   // s <= Z + beta*s
   // dot(p,s)
   int Nblocks = (N+NBPCG_BLOCKSIZE-1)/NBPCG_BLOCKSIZE;
-  Nblocks = (Nblocks>NBPCG_BLOCKSIZE) ? NBPCG_BLOCKSIZE : Nblocks; //limit to NBPCG_BLOCKSIZE entries
+  Nblocks = std::min(Nblocks,NBPCG_BLOCKSIZE); //limit to NBPCG_BLOCKSIZE entries
 
-  update1NBPCGKernel(N, Nblocks, o_z, o_Z, beta, o_p, o_s, o_tmpdots);
+  update1NBPCGKernel(N, Nblocks, o_z, o_Z, beta, o_p, o_s, o_dots);
 
-  o_tmpdots.copyTo(tmpdots, Nblocks*sizeof(dfloat));
+  if (Nblocks>0) {
+    dots.copyFrom(o_dots, Nblocks);
+  } else {
+    dots[0] = 0.0;
+  }
 
-  localdots[0] = 0;
-  for(int n=0;n<Nblocks;++n)
-    localdots[0] += tmpdots[n];
+  for(int n=1;n<Nblocks;++n)
+    dots[0] += dots[n];
 
-  globaldots[0] = 0;
-  MPI_Iallreduce(localdots, globaldots, 1, MPI_DFLOAT, MPI_SUM, comm, &request);
+  comm.Iallreduce(dots, Comm::Sum, 1, request);
 }
 
-void nbpcg::Update2NBPCG(const dfloat alpha, occa::memory &o_r){
+void nbpcg::Update2NBPCG(const dfloat alpha, deviceMemory<dfloat>& o_r){
 
   // r <= r - alpha*s
   // z <= z - alpha*S
@@ -195,26 +199,24 @@ void nbpcg::Update2NBPCG(const dfloat alpha, occa::memory &o_r){
   int Nblocks = (N+NBPCG_BLOCKSIZE-1)/NBPCG_BLOCKSIZE;
   Nblocks = (Nblocks>NBPCG_BLOCKSIZE) ? NBPCG_BLOCKSIZE : Nblocks; //limit to NBPCG_BLOCKSIZE entries
 
-  update2NBPCGKernel(N, Nblocks, o_s, o_S, alpha, o_r, o_z, o_tmpdots);
-
-  o_tmpdots.copyTo(tmpdots, 3*Nblocks*sizeof(dfloat));
+  update2NBPCGKernel(N, Nblocks, o_s, o_S, alpha, o_r, o_z, o_dots);
 
-  localdots[0] = 0;
-  localdots[1] = 0;
-  localdots[2] = 0;
-  for(int n=0;n<Nblocks;++n) {
-    localdots[0] += tmpdots[0+3*n];
-    localdots[1] += tmpdots[1+3*n];
-    localdots[2] += tmpdots[2+3*n];
+  if (Nblocks>0) {
+    dots.copyFrom(o_dots, 3*Nblocks);
+  } else {
+    dots[0] = 0.0;
+    dots[1] = 0.0;
+    dots[2] = 0.0;
   }
 
-  globaldots[0] = 0;
-  globaldots[1] = 0;
-  globaldots[2] = 0;
-  MPI_Iallreduce(localdots, globaldots, 3, MPI_DFLOAT, MPI_SUM, comm, &request);
+  for(int n=1;n<Nblocks;++n) {
+    dots[0] += dots[0+3*n];
+    dots[1] += dots[1+3*n];
+    dots[2] += dots[2+3*n];
+  }
+  comm.Iallreduce(dots, Comm::Sum, 3, request);
 }
 
-nbpcg::~nbpcg() {
-  update1NBPCGKernel.free();
-  update2NBPCGKernel.free();
-}
\ No newline at end of file
+} //namespace LinearSolver
+
+} //namespace libp
diff --git a/libs/linearSolver/linearSolverPCG.cpp b/libs/linearSolver/linearSolverPCG.cpp
index 31b755fe5..b9cad17e2 100644
--- a/libs/linearSolver/linearSolverPCG.cpp
+++ b/libs/linearSolver/linearSolverPCG.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,31 +26,35 @@ SOFTWARE.
 
 #include "linearSolver.hpp"
 
+namespace libp {
+
+namespace LinearSolver {
+
 #define PCG_BLOCKSIZE 512
 
 pcg::pcg(dlong _N, dlong _Nhalo,
-         platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
-  linearSolver_t(_N, _Nhalo, _platform, _settings, _comm) {
+         platform_t& _platform, settings_t& _settings, comm_t _comm):
+  linearSolverBase_t(_N, _Nhalo, _platform, _settings, _comm) {
+
+  platform.linAlg().InitKernels({"axpy", "innerProd", "norm2"});
 
   dlong Ntotal = N + Nhalo;
 
   flexible = settings.compareSetting("LINEAR SOLVER", "FPCG");
 
   /*aux variables */
-  dfloat *dummy = (dfloat *) calloc(Ntotal,sizeof(dfloat)); //need this to avoid uninitialized memory warnings
-  o_p  = platform.malloc(Ntotal*sizeof(dfloat),dummy);
-  o_z  = platform.malloc(Ntotal*sizeof(dfloat),dummy);
-  o_Ax = platform.malloc(Ntotal*sizeof(dfloat),dummy);
-  o_Ap = platform.malloc(Ntotal*sizeof(dfloat),dummy);
-  free(dummy);
+  memory<dfloat> dummy(Ntotal, 0.0); //need this to avoid uninitialized memory warnings
+  o_p  = platform.malloc<dfloat>(dummy);
+  o_z  = platform.malloc<dfloat>(dummy);
+  o_Ax = platform.malloc<dfloat>(dummy);
+  o_Ap = platform.malloc<dfloat>(dummy);
 
   //pinned tmp buffer for reductions
-  tmprdotr = (dfloat*) platform.hostMalloc(PCG_BLOCKSIZE*sizeof(dfloat),
-                                          NULL, h_tmprdotr);
-  o_tmprdotr = platform.malloc(PCG_BLOCKSIZE*sizeof(dfloat));
+  rdotr = platform.hostMalloc<dfloat>(PCG_BLOCKSIZE);
+  o_rdotr = platform.malloc<dfloat>(PCG_BLOCKSIZE);
 
   /* build kernels */
-  occa::properties kernelInfo = platform.props; //copy base properties
+  properties_t kernelInfo = platform.props(); //copy base properties
 
   //add defines
   kernelInfo["defines/" "p_blockSize"] = (int)PCG_BLOCKSIZE;
@@ -60,49 +64,48 @@ pcg::pcg(dlong _N, dlong _Nhalo,
                                 "updatePCG", kernelInfo);
 }
 
-int pcg::Solve(solver_t& solver, precon_t& precon,
-               occa::memory &o_x, occa::memory &o_r,
+int pcg::Solve(operator_t& linearOperator, operator_t& precon,
+               deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_r,
                const dfloat tol, const int MAXIT, const int verbose) {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-  linAlg_t &linAlg = platform.linAlg;
+  int rank = comm.rank();
+  linAlg_t &linAlg = platform.linAlg();
 
   // register scalars
   dfloat rdotz1 = 0.0;
   dfloat rdotz2 = 0.0;
   dfloat alpha = 0.0, beta = 0.0, pAp = 0.0;
-  dfloat rdotr = 0.0;
+  dfloat rdotr0 = 0.0;
   dfloat TOL = 0.0;
 
   // Comput norm of RHS (for stopping tolerance).
   if (settings.compareSetting("LINEAR SOLVER STOPPING CRITERION", "ABS/REL-RHS-2NORM")) {
     dfloat normb = linAlg.norm2(N, o_r, comm);
-    TOL = mymax(tol*tol*normb*normb, tol*tol);
+    TOL = std::max(tol*tol*normb*normb, tol*tol);
   }
 
   // compute A*x
-  solver.Operator(o_x, o_Ax);
+  linearOperator.Operator(o_x, o_Ax);
 
   // subtract r = r - A*x
   linAlg.axpy(N, -1.f, o_Ax, 1.f, o_r);
 
-  rdotr = linAlg.norm2(N, o_r, comm);
-  rdotr = rdotr*rdotr;
+  rdotr0 = linAlg.norm2(N, o_r, comm);
+  rdotr0 = rdotr0*rdotr0;
 
   if (settings.compareSetting("LINEAR SOLVER STOPPING CRITERION", "ABS/REL-INITRESID")) {
-    TOL = mymax(tol*tol*rdotr,tol*tol);
+    TOL = std::max(tol*tol*rdotr0,tol*tol);
   }
 
   if (verbose&&(rank==0))
-    printf("PCG: initial res norm %12.12f \n", sqrt(rdotr));
+    printf("PCG: initial res norm %12.12f \n", sqrt(rdotr0));
 
   int iter;
   for(iter=0;iter<MAXIT;++iter){
 
     // Exit if tolerance is reached, taking at least one step.
-    if (((iter == 0) && (rdotr == 0.0)) ||
-        ((iter > 0) && (rdotr <= TOL))) {
+    if (((iter == 0) && (rdotr0 == 0.0)) ||
+        ((iter > 0) && (rdotr0 <= TOL))) {
       break;
     }
 
@@ -124,7 +127,7 @@ int pcg::Solve(solver_t& solver, precon_t& precon,
     linAlg.axpy(N, 1.f, o_z, beta, o_p);
 
     // A*p
-    solver.Operator(o_p, o_Ap);
+    linearOperator.Operator(o_p, o_Ap);
 
     // p.Ap
     pAp =  linAlg.innerProd(N, o_p, o_Ap, comm);
@@ -134,40 +137,39 @@ int pcg::Solve(solver_t& solver, precon_t& precon,
     //  x <= x + alpha*p
     //  r <= r - alpha*A*p
     //  dot(r,r)
-    rdotr = UpdatePCG(alpha, o_x, o_r);
+    rdotr0 = UpdatePCG(alpha, o_x, o_r);
 
     if (verbose&&(rank==0)) {
-      if(rdotr<0)
-        printf("WARNING CG: rdotr = %17.15lf\n", rdotr);
+      if(rdotr0<0)
+        printf("WARNING CG: rdotr = %17.15lf\n", rdotr0);
 
-      printf("CG: it %d, r norm %12.12le, alpha = %le \n", iter+1, sqrt(rdotr), alpha);
+      printf("CG: it %d, r norm %12.12le, alpha = %le \n", iter+1, sqrt(rdotr0), alpha);
     }
   }
 
   return iter;
 }
 
-dfloat pcg::UpdatePCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r){
+dfloat pcg::UpdatePCG(const dfloat alpha, deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_r){
 
   // x <= x + alpha*p
   // r <= r - alpha*A*p
   // dot(r,r)
   int Nblocks = (N+PCG_BLOCKSIZE-1)/PCG_BLOCKSIZE;
-  Nblocks = (Nblocks>PCG_BLOCKSIZE) ? PCG_BLOCKSIZE : Nblocks; //limit to PCG_BLOCKSIZE entries
+  Nblocks = std::min(Nblocks, PCG_BLOCKSIZE); //limit to PCG_BLOCKSIZE entries, the length rdotr/o_rdotr were allocated with
 
-  updatePCGKernel(N, Nblocks, o_p, o_Ap, alpha, o_x, o_r, o_tmprdotr);
+  updatePCGKernel(N, Nblocks, o_p, o_Ap, alpha, o_x, o_r, o_rdotr); //fused update; the Nblocks partial sums read back below are taken from o_rdotr
 
-  o_tmprdotr.copyTo(tmprdotr, Nblocks*sizeof(dfloat));
+  rdotr.copyFrom(o_rdotr, Nblocks); //device -> host copy; the count is now in entries, not bytes
 
   dfloat rdotr1 = 0;
   for(int n=0;n<Nblocks;++n)
-    rdotr1 += tmprdotr[n];
+    rdotr1 += rdotr[n]; //finish the local reduction on the host
 
-  dfloat globalrdotr1 = 0;
-  MPI_Allreduce(&rdotr1, &globalrdotr1, 1, MPI_DFLOAT, MPI_SUM, comm);
-  return globalrdotr1;
+  comm.Allreduce(rdotr1); //in-place reduction over comm, standing in for the old MPI_SUM Allreduce (NOTE(review): confirm the default op is sum)
+  return rdotr1;
 }
 
-pcg::~pcg() {
-  updatePCGKernel.free();
-}
+} //namespace LinearSolver
+
+} //namespace libp
diff --git a/libs/linearSolver/linearSolverPGMRES.cpp b/libs/linearSolver/linearSolverPGMRES.cpp
index a8051961f..0f837d4e7 100644
--- a/libs/linearSolver/linearSolverPGMRES.cpp
+++ b/libs/linearSolver/linearSolverPGMRES.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,16 +26,19 @@ SOFTWARE.
 
 #include "linearSolver.hpp"
 
+namespace libp {
+
+namespace LinearSolver {
+
 #define PGMRES_RESTART 20
 
 pgmres::pgmres(dlong _N, dlong _Nhalo,
-         platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
-  linearSolver_t(_N, _Nhalo, _platform, _settings, _comm) {
+         platform_t& _platform, settings_t& _settings, comm_t _comm):
+  linearSolverBase_t(_N, _Nhalo, _platform, _settings, _comm) {
 
   // Make sure LinAlg has the necessary kernels
-  platform.linAlg.InitKernels({"axpy", "zaxpy",
-                               "innerProd", "weightedInnerProd",
-                               "norm2", "weightedNorm2"});
+  platform.linAlg().InitKernels({"axpy", "zaxpy",
+                               "innerProd", "norm2"});
 
   dlong Ntotal = N + Nhalo;
 
@@ -43,36 +46,34 @@ pgmres::pgmres(dlong _N, dlong _Nhalo,
   //TODO make this modifyable via settings
   restart=PGMRES_RESTART;
 
-  dfloat *dummy = (dfloat *) calloc(Ntotal,sizeof(dfloat)); //need this to avoid uninitialized memory warnings
+  memory<dfloat> dummy(Ntotal, 0.0); //need this to avoid uninitialized memory warnings
 
-  o_V = new occa::memory[restart];
+  o_V.malloc(restart);
   for(int i=0; i<restart; ++i){
-    o_V[i] = platform.malloc(Ntotal*sizeof(dfloat), dummy);
+    o_V[i] = platform.malloc<dfloat>(dummy);
   }
 
-  H  = (dfloat *) calloc((restart+1)*(restart+1), sizeof(dfloat));
-  sn = (dfloat *) calloc(restart, sizeof(dfloat));
-  cs = (dfloat *) calloc(restart, sizeof(dfloat));
-  s = (dfloat *) calloc(restart+1, sizeof(dfloat));
-  y = (dfloat *) calloc(restart, sizeof(dfloat));
+  H .malloc((restart+1)*(restart+1), 0.0); //host workspaces are zero-filled, as the replaced calloc calls did
+  sn.malloc(restart, 0.0);
+  cs.malloc(restart, 0.0);
+  s.malloc(restart+1, 0.0);
+  y.malloc(restart, 0.0);
 
   /*aux variables */
-  o_Ax = platform.malloc(Ntotal*sizeof(dfloat), dummy);
-  o_z  = platform.malloc(Ntotal*sizeof(dfloat), dummy);
-  o_r  = platform.malloc(Ntotal*sizeof(dfloat), dummy);
-  free(dummy);
+  o_Ax = platform.malloc<dfloat>(dummy);
+  o_z  = platform.malloc<dfloat>(dummy);
+  o_r  = platform.malloc<dfloat>(dummy);
 }
 
-int pgmres::Solve(solver_t& solver, precon_t& precon,
-               occa::memory &o_x, occa::memory &o_b,
+int pgmres::Solve(operator_t& linearOperator, operator_t& precon,
+               deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_b,
                const dfloat tol, const int MAXIT, const int verbose) {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-  linAlg_t &linAlg = platform.linAlg;
+  int rank = comm.rank();
+  linAlg_t &linAlg = platform.linAlg();
 
   // compute A*x
-  solver.Operator(o_x, o_Ax);
+  linearOperator.Operator(o_x, o_Ax);
 
   // subtract z = b - A*x
   linAlg.zaxpy(N, -1.f, o_Ax, 1.f, o_b, o_z);
@@ -83,7 +84,7 @@ int pgmres::Solve(solver_t& solver, precon_t& precon,
   dfloat nr = linAlg.norm2(N, o_r, comm);
 
   dfloat error = nr;
-  const dfloat TOL = mymax(tol*nr,tol);
+  const dfloat TOL = std::max(tol*nr,tol);
 
   if (verbose&&(rank==0))
     printf("PGMRES: initial res norm %12.12f \n", nr);
@@ -104,7 +105,7 @@ int pgmres::Solve(solver_t& solver, precon_t& precon,
     //Construct orthonormal basis via Gram-Schmidt
     for(int i=0;i<restart;++i){
       // compute z = A*V(:,i)
-      solver.Operator(o_V[i], o_z);
+      linearOperator.Operator(o_V[i], o_z);
 
       // r = Precon^{-1} z
       precon.Operator(o_z, o_r);
@@ -150,7 +151,7 @@ int pgmres::Solve(solver_t& solver, precon_t& precon,
       s[i]   =  cs[i]*s[i];
 
       iter++;
-      error = fabs(s[i+1]);
+      error = std::abs(s[i+1]);
 
       if (verbose&&(rank==0)) {
         printf("GMRES: it %d, approx residual norm %12.12le \n", iter, error);
@@ -170,7 +171,7 @@ int pgmres::Solve(solver_t& solver, precon_t& precon,
     UpdateGMRES(o_x, restart);
 
     // compute A*x
-    solver.Operator(o_x, o_Ax);
+    linearOperator.Operator(o_x, o_Ax);
 
     // subtract z = b - A*x
     linAlg.zaxpy(N, -1.f, o_Ax, 1.f, o_b, o_z);
@@ -188,7 +189,7 @@ int pgmres::Solve(solver_t& solver, precon_t& precon,
   return iter;
 }
 
-void pgmres::UpdateGMRES(occa::memory& o_x, const int I){
+void pgmres::UpdateGMRES(deviceMemory<dfloat>& o_x, const int I){
 
   for(int k=I-1; k>=0; --k){
     y[k] = s[k];
@@ -201,16 +202,10 @@ void pgmres::UpdateGMRES(occa::memory& o_x, const int I){
 
   //TODO this is really a GEMM, should write it that way
   for(int j=0; j<I; ++j){
-    platform.linAlg.axpy(N, y[j], o_V[j], 1.0, o_x);
+    platform.linAlg().axpy(N, y[j], o_V[j], 1.0, o_x);
   }
 }
 
-pgmres::~pgmres() {
-  if(H) free(H);
-  if(sn) free(sn);
-  if(cs) free(cs);
-  if(s) free(s);
-  if(y) free(y);
+} //namespace LinearSolver
 
-  if (o_V) delete[] o_V;
-}
\ No newline at end of file
+} //namespace libp
diff --git a/libs/linearSolver/linearSolverPMINRES.cpp b/libs/linearSolver/linearSolverPMINRES.cpp
index e9afe3617..d48570042 100644
--- a/libs/linearSolver/linearSolverPMINRES.cpp
+++ b/libs/linearSolver/linearSolverPMINRES.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,60 +26,54 @@ SOFTWARE.
 
 #include "linearSolver.hpp"
 
+namespace libp {
+
+namespace LinearSolver {
+
 pminres::pminres(dlong _N, dlong _Nhalo,
-                 platform_t& _platform, settings_t& _settings, MPI_Comm _comm):
-  linearSolver_t(_N, _Nhalo, _platform, _settings, _comm)
+                 platform_t& _platform, settings_t& _settings, comm_t _comm):
+  linearSolverBase_t(_N, _Nhalo, _platform, _settings, _comm)
 {
-  platform.linAlg.InitKernels({"axpy", "zaxpy", "scale", "set",
-                               "innerProd", "weightedInnerProd",
-                               "norm2", "weightedNorm2"});
+  platform.linAlg().InitKernels({"axpy", "scale", "innerProd"}); // Make sure LinAlg has the kernels Solve calls
 
   dlong Ntotal = N + Nhalo;
 
-  dfloat *dummy = new dfloat[Ntotal]();
-  o_p     = platform.malloc(Ntotal*sizeof(dfloat), dummy);
-  o_z     = platform.malloc(Ntotal*sizeof(dfloat), dummy);
-  o_r     = platform.malloc(Ntotal*sizeof(dfloat), dummy);
-  o_r_old = platform.malloc(Ntotal*sizeof(dfloat), dummy);
-  o_q     = platform.malloc(Ntotal*sizeof(dfloat), dummy);
-  o_q_old = platform.malloc(Ntotal*sizeof(dfloat), dummy);
-  delete[] dummy;
+  memory<dfloat> dummy(Ntotal, 0.0); //zero-filled host buffer, need this to avoid uninitialized memory warnings
+  o_p     = platform.malloc<dfloat>(dummy);
+  o_z     = platform.malloc<dfloat>(dummy);
+  o_r     = platform.malloc<dfloat>(dummy);
+  o_r_old = platform.malloc<dfloat>(dummy);
+  o_q     = platform.malloc<dfloat>(dummy);
+  o_q_old = platform.malloc<dfloat>(dummy);
 
-  occa::properties kernelInfo = platform.props;
+  properties_t kernelInfo = platform.props(); //copy base properties
   updateMINRESKernel = platform.buildKernel(LINEARSOLVER_DIR "/okl/linearSolverUpdateMINRES.okl", "updateMINRES", kernelInfo);
-
-  return;
-}
-
-pminres::~pminres()
-{
-  return;
 }
 
-int pminres::Solve(solver_t& solver, precon_t& precon,
-                   occa::memory &o_x, occa::memory &o_b,
+int pminres::Solve(operator_t& linearOperator, operator_t& precon,
+                   deviceMemory<dfloat>& o_x, deviceMemory<dfloat>& o_b,
                    const dfloat tol, const int MAXIT, const int verbose)
 {
-  int    rank, iter;
+  int iter;
   dfloat a0, a1, a2, a3, del, gam, gamp, c, cp, s, sp, eta;
   dfloat TOL;
 
-  MPI_Comm_rank(comm, &rank);
-  linAlg_t &linAlg = platform.linAlg;
+  int rank = comm.rank();
+  linAlg_t &linAlg = platform.linAlg();
 
-  solver.Operator(o_x, o_r);            // r = b - A*x
+  linearOperator.Operator(o_x, o_r);            // r = b - A*x
   linAlg.axpy(N, 1.0, o_b, -1.0, o_r);
   precon.Operator(o_r, o_z);            // z = M\r
 
   gamp = 0.0;
-  gam  = sqrt(innerProd(o_z, o_r));     // gam = sqrt(z . r);
+  gam  = sqrt(linAlg.innerProd(N, o_z, o_r, comm)); // gam = sqrt(z . r);
   eta  = gam;
   sp   = 0.0;
   s    = 0.0;
   cp   = 1.0;
   c    = 1.0;
 
-  TOL = mymax(tol*fabs(eta), tol);
+  TOL = std::max(tol*std::abs(eta), tol);
   if (verbose && (rank == 0)) {
     printf("PMINRES:  initial eta = % .15e, target %.15e\n", eta, tol);
   }
@@ -91,7 +85,7 @@ int pminres::Solve(solver_t& solver, precon_t& precon,
       printf("PMINRES:  it %3d  eta = % .15e, gamma = %.15e\n", iter, eta, gam);
     }
 
-    if ((fabs(eta) < TOL) && (iter >= 1)) {
+    if ((std::abs(eta) < TOL) && (iter >= 1)) {
       if (verbose && (rank == 0)) {
         printf("PMINRES converged in %d iterations (eta = % .15e).\n", iter, eta);
       }
@@ -99,8 +93,8 @@ int pminres::Solve(solver_t& solver, precon_t& precon,
     }
 
     linAlg.scale(N, 1.0/gam, o_z);                    // z = z/gam
-    solver.Operator(o_z, o_p);                        // p = A*z
-    del = innerProd(o_p, o_z);                        // del = p . z
+    linearOperator.Operator(o_z, o_p);                        // p = A*z
+    del = linAlg.innerProd(N, o_p, o_z, comm);        // del = p . z
     a0 = c*del - cp*s*gam;
     a2 = s*del + cp*c*gam;
     a3 = sp*gam;
@@ -122,7 +116,7 @@ int pminres::Solve(solver_t& solver, precon_t& precon,
 #endif
     precon.Operator(o_r, o_z);                        // z = M\r
     gamp = gam;
-    gam  = sqrt(innerProd(o_z, o_r));                 // gam = sqrt(z . r)
+    gam  = sqrt(linAlg.innerProd(N, o_z, o_r, comm)); // gam = sqrt(z . r)
     a1   = sqrt(a0*a0 + gam*gam);
     cp   = c;
     c    = a0/a1;
@@ -138,12 +132,11 @@ int pminres::Solve(solver_t& solver, precon_t& precon,
   return iter;
 }
 
-dfloat pminres::innerProd(occa::memory& o_x, occa::memory& o_y)
-{
-  return platform.linAlg.innerProd(N, o_x, o_y, comm);
-}
-
 void pminres::UpdateMINRES(const dfloat ma2, const dfloat ma3, const dfloat alpha, const dfloat beta)
 {
   updateMINRESKernel(N, ma2, ma3, alpha, beta, o_z, o_q_old, o_q, o_r_old, o_r, o_p);
 }
+
+} //namespace LinearSolver
+
+} //namespace libp
diff --git a/libs/linearSolver/okl/igBasisInnerProducts.okl b/libs/linearSolver/okl/igBasisInnerProducts.okl
index 64e9fb165..7b5d12647 100644
--- a/libs/linearSolver/okl/igBasisInnerProducts.okl
+++ b/libs/linearSolver/okl/igBasisInnerProducts.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -35,7 +35,7 @@ SOFTWARE.
 {
   for (dlong b = 0; b < Nblocks; ++b; @outer(0)) {
 
-    @shared volatile dfloat s_wxy[p_blockSize];
+    @shared dfloat s_wxy[p_blockSize];
 
     // load x to register
     // block over igNhist
@@ -52,7 +52,6 @@ SOFTWARE.
 
     for (int fld = 0; fld < dim; ++fld) {
 
-      @barrier("local");
 
       for (int t = 0; t < p_blockSize; ++t; @inner(0)) {
         dlong id = t + p_blockSize*b;
@@ -68,7 +67,6 @@ SOFTWARE.
         s_wxy[t] = res;
       }
 
-      @barrier("local");
 
 #if p_blockSize>512
       for(int t=0;t<p_blockSize;++t;@inner(0))
@@ -76,7 +74,6 @@ SOFTWARE.
           s_wxy[t] += s_wxy[t+512];
         }
 
-      @barrier("local");
 #endif
 
 #if p_blockSize>256
@@ -85,7 +82,6 @@ SOFTWARE.
           s_wxy[t] += s_wxy[t+256];
         }
 
-      @barrier("local");
 #endif
 
       for(int t=0;t<p_blockSize;++t;@inner(0))
@@ -93,14 +89,12 @@ SOFTWARE.
           s_wxy[t] += s_wxy[t+128];
         }
 
-      @barrier("local");
 
       for(int t=0;t<p_blockSize;++t;@inner(0))
         if(t< 64){
           s_wxy[t] += s_wxy[t+64];
         }
 
-      @barrier("local");
 
       for(int t=0;t<p_blockSize;++t;@inner(0))
         if(t< 32){
@@ -127,7 +121,6 @@ SOFTWARE.
           s_wxy[t] += s_wxy[t+2];
         }
 
-      @barrier("local");
 
       // assumes igNhist < p_blockSize
       for(int t=0;t<p_blockSize;++t;@inner(0)){
diff --git a/libs/linearSolver/okl/igDropQRFirstColumn.okl b/libs/linearSolver/okl/igDropQRFirstColumn.okl
index 495f8cc4d..3be100c2f 100644
--- a/libs/linearSolver/okl/igDropQRFirstColumn.okl
+++ b/libs/linearSolver/okl/igDropQRFirstColumn.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -74,7 +74,6 @@ void givensRotation(dfloat a, dfloat b, dfloat *c, dfloat *s)
 
     for (int i = 0; i < p_igNhist - 1; ++i) {
 
-      @barrier("local");
 
       // thread 0 computes the Given's rotation and updates R
       for (dlong t = 0; t < p_NT; ++t; @inner(0)) {
@@ -102,7 +101,6 @@ void givensRotation(dfloat a, dfloat b, dfloat *c, dfloat *s)
         }
       }
 
-      @barrier("local");
 
       // all threads perform Givens rotations at a node
       for (dlong t = 0; t < p_NT; ++t; @inner(0)) {
@@ -129,8 +127,6 @@ void givensRotation(dfloat a, dfloat b, dfloat *c, dfloat *s)
       }
     }
 
-    @barrier("global");
-
     for (dlong t = 0; t < p_NT; ++t; @inner(0)) {
       const dlong n = b*p_NT + t;
       if (n < Ndof) {
diff --git a/libs/linearSolver/okl/igExtrap.okl b/libs/linearSolver/okl/igExtrap.okl
index de2ced7cd..18562b512 100644
--- a/libs/linearSolver/okl/igExtrap.okl
+++ b/libs/linearSolver/okl/igExtrap.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/linearSolver/okl/igReconstruct.okl b/libs/linearSolver/okl/igReconstruct.okl
index cdd342d58..0a34a1c70 100644
--- a/libs/linearSolver/okl/igReconstruct.okl
+++ b/libs/linearSolver/okl/igReconstruct.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -43,7 +43,6 @@ SOFTWARE.
         s_alphas[t] = (t<igNhist) ? beta*alphas[t] : 0;
     }
 
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
 
diff --git a/libs/linearSolver/okl/igScale.okl b/libs/linearSolver/okl/igScale.okl
index 25022b125..b32d7fdfa 100644
--- a/libs/linearSolver/okl/igScale.okl
+++ b/libs/linearSolver/okl/igScale.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/linearSolver/okl/igUpdate.okl b/libs/linearSolver/okl/igUpdate.okl
index 84dec97ca..798d3a79a 100644
--- a/libs/linearSolver/okl/igUpdate.okl
+++ b/libs/linearSolver/okl/igUpdate.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/linearSolver/okl/linearSolverUpdateMINRES.okl b/libs/linearSolver/okl/linearSolverUpdateMINRES.okl
index c7ef6491e..455225ba1 100644
--- a/libs/linearSolver/okl/linearSolverUpdateMINRES.okl
+++ b/libs/linearSolver/okl/linearSolverUpdateMINRES.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Anthony Austin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/linearSolver/okl/linearSolverUpdateNBFPCG.okl b/libs/linearSolver/okl/linearSolverUpdateNBFPCG.okl
index ae140cd2b..b5c067ffe 100644
--- a/libs/linearSolver/okl/linearSolverUpdateNBFPCG.okl
+++ b/libs/linearSolver/okl/linearSolverUpdateNBFPCG.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -35,7 +35,7 @@
 
   for(dlong b=0;b<Nblocks;++b;@outer(0)){
 
-    @shared volatile dfloat s_dot[3][p_blockSize];
+    @shared dfloat s_dot[3][p_blockSize];
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
 
@@ -54,7 +54,6 @@
       s_dot[2][t] = sumrdotr;
     }
 
-    @barrier("local");
 
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) {
@@ -62,7 +61,6 @@
       s_dot[1][t] += s_dot[1][t+512];
       s_dot[2][t] += s_dot[2][t+512];
     }
-    @barrier("local");
 #endif
 
 #if p_blockSize>256
@@ -71,7 +69,6 @@
       s_dot[1][t] += s_dot[1][t+256];
       s_dot[2][t] += s_dot[2][t+256];
     }
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) {
@@ -79,49 +76,42 @@
       s_dot[1][t] += s_dot[1][t+128];
       s_dot[2][t] += s_dot[2][t+128];
     }
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) {
       s_dot[0][t] += s_dot[0][t+ 64];
       s_dot[1][t] += s_dot[1][t+ 64];
       s_dot[2][t] += s_dot[2][t+ 64];
     }
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) {
       s_dot[0][t] += s_dot[0][t+ 32];
       s_dot[1][t] += s_dot[1][t+ 32];
       s_dot[2][t] += s_dot[2][t+ 32];
     }
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) {
       s_dot[0][t] += s_dot[0][t+ 16];
       s_dot[1][t] += s_dot[1][t+ 16];
       s_dot[2][t] += s_dot[2][t+ 16];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) {
       s_dot[0][t] += s_dot[0][t+  8];
       s_dot[1][t] += s_dot[1][t+  8];
       s_dot[2][t] += s_dot[2][t+  8];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) {
       s_dot[0][t] += s_dot[0][t+  4];
       s_dot[1][t] += s_dot[1][t+  4];
       s_dot[2][t] += s_dot[2][t+  4];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) {
       s_dot[0][t] += s_dot[0][t+  2];
       s_dot[1][t] += s_dot[1][t+  2];
       s_dot[2][t] += s_dot[2][t+  2];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) {
       dots[0+3*b] = s_dot[0][0] + s_dot[0][1];
@@ -147,7 +137,7 @@
 
   for(dlong b=0;b<Nblocks;++b;@outer(0)){
 
-    @shared volatile dfloat s_dot[4][p_blockSize];
+    @shared dfloat s_dot[4][p_blockSize];
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
 
@@ -189,7 +179,6 @@
       s_dot[3][t] = sumrdotr;
     }
 
-    @barrier("local");
 
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) {
@@ -198,7 +187,6 @@
       s_dot[2][t] += s_dot[2][t+512];
       s_dot[3][t] += s_dot[3][t+512];
     }
-    @barrier("local");
 #endif
 
 #if p_blockSize>256
@@ -208,7 +196,6 @@
       s_dot[2][t] += s_dot[2][t+256];
       s_dot[3][t] += s_dot[3][t+256];
     }
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) {
@@ -217,7 +204,6 @@
       s_dot[2][t] += s_dot[2][t+128];
       s_dot[3][t] += s_dot[3][t+128];
     }
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) {
       s_dot[0][t] += s_dot[0][t+ 64];
@@ -225,7 +211,6 @@
       s_dot[2][t] += s_dot[2][t+ 64];
       s_dot[3][t] += s_dot[3][t+ 64];
     }
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) {
       s_dot[0][t] += s_dot[0][t+ 32];
@@ -233,7 +218,6 @@
       s_dot[2][t] += s_dot[2][t+ 32];
       s_dot[3][t] += s_dot[3][t+ 32];
     }
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) {
       s_dot[0][t] += s_dot[0][t+ 16];
@@ -241,7 +225,6 @@
       s_dot[2][t] += s_dot[2][t+ 16];
       s_dot[3][t] += s_dot[3][t+ 16];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) {
       s_dot[0][t] += s_dot[0][t+  8];
@@ -249,7 +232,6 @@
       s_dot[2][t] += s_dot[2][t+  8];
       s_dot[3][t] += s_dot[3][t+  8];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) {
       s_dot[0][t] += s_dot[0][t+  4];
@@ -257,7 +239,6 @@
       s_dot[2][t] += s_dot[2][t+  4];
       s_dot[3][t] += s_dot[3][t+  4];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) {
       s_dot[0][t] += s_dot[0][t+  2];
@@ -265,7 +246,6 @@
       s_dot[2][t] += s_dot[2][t+  2];
       s_dot[3][t] += s_dot[3][t+  2];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) {
       dots[0+4*b] = s_dot[0][0] + s_dot[0][1];
diff --git a/libs/linearSolver/okl/linearSolverUpdateNBPCG.okl b/libs/linearSolver/okl/linearSolverUpdateNBPCG.okl
index 8930c7507..78eb26b94 100644
--- a/libs/linearSolver/okl/linearSolverUpdateNBPCG.okl
+++ b/libs/linearSolver/okl/linearSolverUpdateNBPCG.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -37,7 +37,7 @@
 
   for(dlong b=0;b<Nblocks;++b;@outer(0)){
 
-    @shared volatile dfloat s_dot[p_blockSize];
+    @shared dfloat s_dot[p_blockSize];
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
 
@@ -60,38 +60,28 @@
       s_dot[t] = sum;
     }
 
-    @barrier("local");
 
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_dot[t] += s_dot[t+512];
-    @barrier("local");
 #endif
 
 #if p_blockSize>256
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_dot[t] += s_dot[t+256];
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_dot[t] += s_dot[t+128];
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_dot[t] += s_dot[t+ 64];
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_dot[t] += s_dot[t+ 32];
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_dot[t] += s_dot[t+ 16];
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_dot[t] += s_dot[t+  8];
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_dot[t] += s_dot[t+  4];
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_dot[t] += s_dot[t+  2];
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) pdots[b] = s_dot[0] + s_dot[1];
   }
@@ -108,7 +98,7 @@
 
   for(dlong b=0;b<Nblocks;++b;@outer(0)){
 
-    @shared volatile dfloat s_dot[3][p_blockSize];
+    @shared dfloat s_dot[3][p_blockSize];
 
     for(int t=0;t<p_blockSize;++t;@inner(0)){
 
@@ -138,7 +128,6 @@
       s_dot[2][t] = sumrdotr;
     }
 
-    @barrier("local");
 
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) {
@@ -146,7 +135,6 @@
       s_dot[1][t] += s_dot[1][t+512];
       s_dot[2][t] += s_dot[2][t+512];
     }
-    @barrier("local");
 #endif
 
 #if p_blockSize>256
@@ -155,7 +143,6 @@
       s_dot[1][t] += s_dot[1][t+256];
       s_dot[2][t] += s_dot[2][t+256];
     }
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) {
@@ -163,49 +150,42 @@
       s_dot[1][t] += s_dot[1][t+128];
       s_dot[2][t] += s_dot[2][t+128];
     }
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) {
       s_dot[0][t] += s_dot[0][t+ 64];
       s_dot[1][t] += s_dot[1][t+ 64];
       s_dot[2][t] += s_dot[2][t+ 64];
     }
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) {
       s_dot[0][t] += s_dot[0][t+ 32];
       s_dot[1][t] += s_dot[1][t+ 32];
       s_dot[2][t] += s_dot[2][t+ 32];
     }
-    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) {
       s_dot[0][t] += s_dot[0][t+ 16];
       s_dot[1][t] += s_dot[1][t+ 16];
       s_dot[2][t] += s_dot[2][t+ 16];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) {
       s_dot[0][t] += s_dot[0][t+  8];
       s_dot[1][t] += s_dot[1][t+  8];
       s_dot[2][t] += s_dot[2][t+  8];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) {
       s_dot[0][t] += s_dot[0][t+  4];
       s_dot[1][t] += s_dot[1][t+  4];
       s_dot[2][t] += s_dot[2][t+  4];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) {
       s_dot[0][t] += s_dot[0][t+  2];
       s_dot[1][t] += s_dot[1][t+  2];
       s_dot[2][t] += s_dot[2][t+  2];
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) {
       dots[0+3*b] = s_dot[0][0] + s_dot[0][1];
diff --git a/libs/linearSolver/okl/linearSolverUpdatePCG.okl b/libs/linearSolver/okl/linearSolverUpdatePCG.okl
index d3d4ec838..84be96da0 100644
--- a/libs/linearSolver/okl/linearSolverUpdatePCG.okl
+++ b/libs/linearSolver/okl/linearSolverUpdatePCG.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -54,39 +54,22 @@
         id += p_blockSize*Nblocks;
       }
     }
-    @barrier("local");
 
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_dot[t] += s_dot[t+512];
-    @barrier("local");
 #endif
 
 #if p_blockSize>256
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_dot[t] += s_dot[t+256];
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_dot[t] += s_dot[t+128];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_dot[t] += s_dot[t+ 64];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_dot[t] += s_dot[t+ 32];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_dot[t] += s_dot[t+ 16];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_dot[t] += s_dot[t+  8];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_dot[t] += s_dot[t+  4];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_dot[t] += s_dot[t+  2];
-    //    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) redr[b] = s_dot[0] + s_dot[1];
   }
 }
diff --git a/libs/makefile b/libs/makefile
index 03349971f..3e9fb304e 100644
--- a/libs/makefile
+++ b/libs/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -39,7 +39,7 @@ Libraries makefile targets:
 Usage:
 
 make all
-	 Build core, ogs, mesh, linAlg, timeStepper, linearSovler, and parAlmond libraries.
+	 Build core, ogs, parAdogs, mesh, linAlg, timeStepper, linearSolver, and parAlmond libraries.
 make {lib}
 	 Build only {lib} library.
 make clean
@@ -57,7 +57,7 @@ Can use "make verbose=true" for verbose output.
 
 endef
 
-ifeq (,$(filter core linAlg linearSolver mesh ogs parAlmond timeStepper \
+ifeq (,$(filter core linAlg linearSolver parAdogs mesh ogs parAlmond timeStepper \
 								clean clean-libs clean-kernels realclean info help, $(MAKECMDGOALS)))
 ifneq (,$(MAKECMDGOALS))
 $(error ${LIB_HELP_MSG})
@@ -72,13 +72,11 @@ include ../make.top
 endif
 endif
 
-#third party
-GS_DIR=${LIBP_TPL_DIR}/gslib
-
 #libraries
 LIBCORE_DIR=${LIBP_LIBS_DIR}/core
 LIBLINALG_DIR=${LIBP_LIBS_DIR}/linAlg
 LIBLINEARSOLVER_DIR=${LIBP_LIBS_DIR}/linearSolver
+LIBPARADOGS_DIR=${LIBP_LIBS_DIR}/parAdogs
 LIBMESH_DIR=${LIBP_LIBS_DIR}/mesh
 LIBOGS_DIR=${LIBP_LIBS_DIR}/ogs
 LIBPARALMOND_DIR=${LIBP_LIBS_DIR}/parAlmond
@@ -91,13 +89,18 @@ INCLUDES=${LIBP_INCLUDES}
 LIBCORE_DEFINES=-DLIBP_DIR='"${LIBP_DIR}"'
 LIBLINALG_DEFINES=-DLINALG_DIR='"${LIBLINALG_DIR}"'
 LIBLINEARSOLVER_DEFINES=-DLINEARSOLVER_DIR='"${LIBLINEARSOLVER_DIR}"'
+LIBPARADOGS_DEFINES=-DPARADOGS_DIR='"${LIBPARADOGS_DIR}"'
 LIBMESH_DEFINES=-DMESH_DIR='"${LIBMESH_DIR}"'
 LIBOGS_DEFINES=-DLIBP_DIR='"${LIBP_DIR}"'  -DOGS_DIR='"${LIBOGS_DIR}"'
 LIBPARALMOND_DEFINES=-DPARALMOND_DIR='"${LIBPARALMOND_DIR}"'
 LIBTIMESTEPPER_DEFINES=-DTIMESTEPPER_DIR='"${LIBTIMESTEPPER_DIR}"'
 
+ifeq (true,${gpu-aware-mpi})
+  LIBOGS_DEFINES+= -DGPU_AWARE_MPI
+endif
+
 #.cpp compilation flags
-LIB_CXXFLAGS=${LIBP_DEFINES} ${LIBP_MPICXXFLAGS} ${INCLUDES}
+LIB_CXXFLAGS=${LIBP_DEFINES} ${LIBP_CXXFLAGS} ${INCLUDES}
 
 #object dependancies
 LIB_DEPS=$(wildcard $(LIBP_INCLUDE_DIR)/*.h)  \
@@ -111,6 +114,9 @@ LIBOGS_DEPS=$(wildcard ${LIBP_INCLUDE_DIR}/ogs/*.h)   \
 						$(wildcard ${LIBP_INCLUDE_DIR}/ogs/*.hpp) \
 						${LIB_DEPS}
 
+LIBPARADOGS_DEPS=$(wildcard ${LIBP_INCLUDE_DIR}/parAdogs/*.hpp) \
+								 ${LIB_DEPS}
+
 LIBPARALMOND_DEPS=$(wildcard ${LIBP_INCLUDE_DIR}/parAlmond/*.h)   \
 				  				$(wildcard ${LIBP_INCLUDE_DIR}/parAlmond/*.hpp) \
 								  ${LIB_DEPS}
@@ -118,6 +124,7 @@ LIBPARALMOND_DEPS=$(wildcard ${LIBP_INCLUDE_DIR}/parAlmond/*.h)   \
 LIBCORE_SRC =$(wildcard core/*.cpp)
 LIBLINALG_SRC =$(wildcard linAlg/*.cpp)
 LIBLINEARSOLVER_SRC =$(wildcard linearSolver/*.cpp)
+LIBPARADOGS_SRC =$(wildcard parAdogs/*.cpp)
 LIBMESH_SRC =$(wildcard mesh/*.cpp)
 LIBOGS_SRC =$(wildcard ogs/*.cpp)
 LIBPARALMOND_SRC =$(wildcard parAlmond/*.cpp)
@@ -126,24 +133,27 @@ LIBTIMESTEPPER_SRC =$(wildcard timeStepper/*.cpp)
 LIBCORE_OBJS=$(LIBCORE_SRC:.cpp=.o)
 LIBLINALG_OBJS=$(LIBLINALG_SRC:.cpp=.o)
 LIBLINEARSOLVER_OBJS=$(LIBLINEARSOLVER_SRC:.cpp=.o)
+LIBPARADOGS_OBJS=$(LIBPARADOGS_SRC:.cpp=.o)
 LIBMESH_OBJS=$(LIBMESH_SRC:.cpp=.o)
 LIBOGS_OBJS=$(LIBOGS_SRC:.cpp=.o)
 LIBPARALMOND_OBJS=$(LIBPARALMOND_SRC:.cpp=.o)
 LIBTIMESTEPPER_OBJS=$(LIBTIMESTEPPER_SRC:.cpp=.o)
 
-.PHONY: all core linAlg linearSolver mesh ogs parAlmond timeStepper \
+.PHONY: all core linAlg linearSolver mesh ogs parAdogs parAlmond timeStepper \
 				clean realclean silentUpdate-core             \
 				silentUpdate-linAlg silentUpdate-linearSolver \
 				silentUpdate-ogs silentUpdate-mesh            \
+				silentUpdate-parAdogs                         \
 				silentUpdate-parAlmond silentUpdate-timeStepper
 
-all: core linAlg linearSolver mesh ogs parAlmond timeStepper
+all: core linAlg linearSolver parAdogs mesh ogs parAlmond timeStepper
 
 core: libcore.a silentUpdate-core
 linAlg: liblinAlg.a silentUpdate-linAlg
 linearSolver: liblinearSolver.a silentUpdate-linearSolver
 mesh: libmesh.a silentUpdate-mesh
 ogs: libogs.a silentUpdate-ogs
+parAdogs: libparAdogs.a silentUpdate-parAdogs
 parAlmond: libparAlmond.a silentUpdate-parAlmond
 timeStepper: libtimeStepper.a silentUpdate-timeStepper
 
@@ -155,14 +165,7 @@ else
 	@ar -cr libcore.a $(LIBCORE_OBJS)
 endif
 
-libgs: | libcore.a
-ifneq (,${verbose})
-	${MAKE} -C $(GS_DIR) install verbose=${verbose}
-else
-	@${MAKE} -C $(GS_DIR) install --no-print-directory
-endif
-
-libogs.a: $(LIBOGS_OBJS) | libgs
+libogs.a: $(LIBOGS_OBJS) | libcore.a
 ifneq (,${verbose})
 	ar -cr libogs.a $(LIBOGS_OBJS)
 else
@@ -170,7 +173,7 @@ else
 	@ar -cr libogs.a $(LIBOGS_OBJS)
 endif
 
-liblinAlg.a: $(LIBLINALG_OBJS)
+liblinAlg.a: $(LIBLINALG_OBJS) | libcore.a
 ifneq (,${verbose})
 	ar -cr liblinAlg.a $(LIBLINALG_OBJS)
 else
@@ -178,7 +181,15 @@ else
 	@ar -cr liblinAlg.a $(LIBLINALG_OBJS)
 endif
 
-libmesh.a: $(LIBMESH_OBJS) | liblinAlg.a libogs.a
+libparAdogs.a: $(LIBPARADOGS_OBJS) | liblinAlg.a libogs.a
+ifneq (,${verbose})
+	ar -cr libparAdogs.a $(LIBPARADOGS_OBJS)
+else
+	@printf "%b" "$(LIB_COLOR)Building library $(@F)$(NO_COLOR)\n";
+	@ar -cr libparAdogs.a $(LIBPARADOGS_OBJS)
+endif
+
+libmesh.a: $(LIBMESH_OBJS) | libparAdogs.a
 ifneq (,${verbose})
 	ar -cr libmesh.a $(LIBMESH_OBJS)
 else
@@ -225,6 +236,9 @@ silentUpdate-ogs:
 silentUpdate-mesh:
 	@true
 
+silentUpdate-parAdogs:
+	@true
+
 silentUpdate-parAlmond:
 	@true
 
@@ -237,70 +251,77 @@ ${OCCA_DIR}/lib/libocca.so:
 # rule for .cpp files
 core/%.o: core/%.cpp $(LIB_DEPS) ${OCCA_DIR}/lib/libocca.so
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $@ -c $< ${LIBCORE_DEFINES} $(LIB_CXXFLAGS)
+	$(LIBP_CXX) -o $@ -c $< ${LIBCORE_DEFINES} $(LIB_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $@ -c $< ${LIBCORE_DEFINES} $(LIB_CXXFLAGS)
+	@$(LIBP_CXX) -o $@ -c $< ${LIBCORE_DEFINES} $(LIB_CXXFLAGS)
 endif
 
 linAlg/%.o: linAlg/%.cpp $(LIB_DEPS) | libcore.a
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $@ -c $< ${LIBLINALG_DEFINES} $(LIB_CXXFLAGS)
+	$(LIBP_CXX) -o $@ -c $< ${LIBLINALG_DEFINES} $(LIB_CXXFLAGS)
+else
+	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
+	@$(LIBP_CXX) -o $@ -c $< ${LIBLINALG_DEFINES} $(LIB_CXXFLAGS)
+endif
+
+parAdogs/%.o: parAdogs/%.cpp $(LIB_DEPS) | libogs.a
+ifneq (,${verbose})
+	$(LIBP_CXX) -o $@ -c $< ${LIBPARADOGS_DEFINES} $(LIB_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $@ -c $< ${LIBLINALG_DEFINES} $(LIB_CXXFLAGS)
+	@$(LIBP_CXX) -o $@ -c $< ${LIBPARADOGS_DEFINES} $(LIB_CXXFLAGS)
 endif
 
 linearSolver/%.o: linearSolver/%.cpp $(LIB_DEPS) | libcore.a
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $@ -c $< ${LIBLINEARSOLVER_DEFINES} $(LIB_CXXFLAGS)
+	$(LIBP_CXX) -o $@ -c $< ${LIBLINEARSOLVER_DEFINES} $(LIB_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $@ -c $< ${LIBLINEARSOLVER_DEFINES} $(LIB_CXXFLAGS)
+	@$(LIBP_CXX) -o $@ -c $< ${LIBLINEARSOLVER_DEFINES} $(LIB_CXXFLAGS)
 endif
 
-mesh/%.o: mesh/%.cpp $(LIBMESH_DEPS) | libogs.a
+mesh/%.o: mesh/%.cpp $(LIBMESH_DEPS) | libparAdogs.a
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $@ -c $< ${LIBMESH_DEFINES} $(LIB_CXXFLAGS)
+	$(LIBP_CXX) -o $@ -c $< ${LIBMESH_DEFINES} $(LIB_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $@ -c $< ${LIBMESH_DEFINES} $(LIB_CXXFLAGS)
+	@$(LIBP_CXX) -o $@ -c $< ${LIBMESH_DEFINES} $(LIB_CXXFLAGS)
 endif
 
-ogs/%.o: ogs/%.cpp $(LIBOGS_DEPS) | libgs
+ogs/%.o: ogs/%.cpp $(LIBOGS_DEPS) | liblinAlg.a
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $@ -c $< ${LIBOGS_DEFINES} -I${GS_DIR}/src $(LIB_CXXFLAGS)
+	$(LIBP_CXX) -o $@ -c $< ${LIBOGS_DEFINES} $(LIB_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $@ -c $< ${LIBOGS_DEFINES} -I${GS_DIR}/src $(LIB_CXXFLAGS)
+	@$(LIBP_CXX) -o $@ -c $< ${LIBOGS_DEFINES} $(LIB_CXXFLAGS)
 endif
 
 parAlmond/%.o: parAlmond/%.cpp $(LIBPARALMOND_DEPS) | liblinearSolver.a
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $@ -c $< ${LIBPARALMOND_DEFINES} $(LIB_CXXFLAGS)
+	$(LIBP_CXX) -o $@ -c $< ${LIBPARALMOND_DEFINES} $(LIB_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $@ -c $< ${LIBPARALMOND_DEFINES} $(LIB_CXXFLAGS)
+	@$(LIBP_CXX) -o $@ -c $< ${LIBPARALMOND_DEFINES} $(LIB_CXXFLAGS)
 endif
 
 timeStepper/%.o: timeStepper/%.cpp $(LIB_DEPS) | libmesh.a
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $@ -c $< ${LIBTIMESTEPPER_DEFINES} $(LIB_CXXFLAGS)
+	$(LIBP_CXX) -o $@ -c $< ${LIBTIMESTEPPER_DEFINES} $(LIB_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $@ -c $< ${LIBTIMESTEPPER_DEFINES} $(LIB_CXXFLAGS)
+	@$(LIBP_CXX) -o $@ -c $< ${LIBTIMESTEPPER_DEFINES} $(LIB_CXXFLAGS)
 endif
 
 #cleanup
 clean:
-	rm -f core/*.o linAlg/*.o linearSolver/*.o mesh/*.o ogs/*.o parAlmond/*.o timeStepper/*.o *.a
+	rm -f core/*.o linAlg/*.o linearSolver/*.o mesh/*.o ogs/*.o parAdogs/*.o parAlmond/*.o timeStepper/*.o *.a
 
 clean-kernels: clean
-#   $(shell ${OCCA_DIR}/bin/occa clear all -y)
-	rm -rf ~/.occa/
+	rm -rf ${LIBP_DIR}/.occa/
 
 realclean: clean
-	${MAKE} -C ${GS_DIR} clean
+	${MAKE} -C ${OCCA_DIR} clean
 
 help:
 	$(info $(value LIB_HELP_MSG))
@@ -312,4 +333,4 @@ info:
 	$(info LIBP_ARCH = $(LIBP_ARCH))
 	$(info CXXFLAGS  = $(LIB_CXXFLAGS))
 	$(info LIBS      = $(LIBS))
-	@true
\ No newline at end of file
+	@true
diff --git a/libs/mesh/mesh.cpp b/libs/mesh/mesh.cpp
deleted file mode 100644
index 42908cb4d..000000000
--- a/libs/mesh/mesh.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
-
-//makeing a mesh object requires it to be bound to a device and communicator
-mesh_t::mesh_t(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm):
-  platform(_platform), settings(_settings), comm(_comm) {
-  props = platform.props;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &size);
-}
-
-mesh2D::mesh2D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm):
-  mesh_t(_platform, _settings, _comm) {}
-
-mesh3D::mesh3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm):
-  mesh_t(_platform, _settings, _comm) {}
-
-meshTri2D::meshTri2D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm):
-  mesh2D(_platform, _settings, _comm) {}
-
-meshQuad2D::meshQuad2D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm):
-  mesh2D(_platform, _settings, _comm) {}
-
-meshTri3D::meshTri3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm):
-  mesh3D(_platform, _settings, _comm) {}
-
-meshQuad3D::meshQuad3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm):
-  mesh3D(_platform, _settings, _comm) {}
-
-meshTet3D::meshTet3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm):
-  mesh3D(_platform, _settings, _comm) {}
-
-meshHex3D::meshHex3D(platform_t& _platform, meshSettings_t& _settings, MPI_Comm _comm):
-  mesh3D(_platform, _settings, _comm) {}
-
-mesh_t::~mesh_t() {
-  if (halo) halo->Free();
-  if (ringHalo) ringHalo->Free();
-  if (ogs) ogs->Free();
-}
\ No newline at end of file
diff --git a/libs/mesh/meshBasis1D.cpp b/libs/mesh/meshBasis1D.cpp
index bdfabcc6a..f8b9374c2 100644
--- a/libs/mesh/meshBasis1D.cpp
+++ b/libs/mesh/meshBasis1D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,54 +26,66 @@ SOFTWARE.
 
 #include "mesh.hpp"
 
+namespace libp {
+
 // ------------------------------------------------------------------------
 // 1D NODES
 // ------------------------------------------------------------------------
-void mesh_t::Nodes1D(int _N, dfloat *_r){
+void mesh_t::Nodes1D(const int _N, memory<dfloat>& _r){
   JacobiGLL(_N, _r); //Gauss-Legendre-Lobatto nodes
 }
 
-void mesh_t::EquispacedNodes1D(int _N, dfloat *_r){
+void mesh_t::EquispacedNodes1D(const int _N, memory<dfloat>& _r){
   int _Nq = _N+1;
 
   dfloat dr = 2.0/_N;
+
+  _r.malloc(_Nq);
   for (int i=0;i<_Nq;i++) _r[i] = -1.0 + i*dr;
 }
 
 // ------------------------------------------------------------------------
 // ORTHONORMAL BASIS POLYNOMIALS
 // ------------------------------------------------------------------------
-void mesh_t::OrthonormalBasis1D(dfloat a, int i, dfloat *P){
-  *P = JacobiP(a,0,0,i); //Legendre Polynomials
+void mesh_t::OrthonormalBasis1D(const dfloat a, const int i, dfloat& P){
+  P = JacobiP(a,0,0,i); //Legendre Polynomials
 }
 
-void mesh_t::GradOrthonormalBasis1D(dfloat a, int i, dfloat *Pr){
-  *Pr = GradJacobiP(a,0,0,i);
+void mesh_t::GradOrthonormalBasis1D(const dfloat a, const int i, dfloat& Pr){
+  Pr = GradJacobiP(a,0,0,i);
 }
 
 // ------------------------------------------------------------------------
 // 1D VANDERMONDE MATRICES
 // ------------------------------------------------------------------------
-void mesh_t::Vandermonde1D(int _N, int Npoints, dfloat *_r, dfloat *V){
+void mesh_t::Vandermonde1D(const int _N,
+                           const memory<dfloat> _r,
+                           memory<dfloat>& V){
 
-  int _Np = (_N+1);
+  const int _Np = (_N+1);
+  const int Npoints = _r.length();
 
+  V.malloc(Npoints*_Np);
   for(int n=0; n<Npoints; n++){
     for(int i=0; i<_Np; i++){
       int id = n*_Np+i;
-      OrthonormalBasis1D(_r[n], i, V+id);
+      OrthonormalBasis1D(_r[n], i, V[id]);
     }
   }
 }
 
-void mesh_t::GradVandermonde1D(int _N, int Npoints, dfloat *_r, dfloat *Vr){
+void mesh_t::GradVandermonde1D(const int _N,
+                               const memory<dfloat> _r,
+                               memory<dfloat>& Vr){
 
-  int _Np = (_N+1);
+  const int _Np = (_N+1);
+  const int Npoints = _r.length();
 
+  Vr.malloc(Npoints*_Np);
   for(int n=0; n<Npoints; n++){
     for(int i=0; i<_Np; i++){
       int id = n*_Np+i;
-      GradOrthonormalBasis1D(_r[n], i, Vr+id);
+      GradOrthonormalBasis1D(_r[n], i, Vr[id]);
     }
   }
 }
@@ -81,9 +93,12 @@ void mesh_t::GradVandermonde1D(int _N, int Npoints, dfloat *_r, dfloat *Vr){
 // ------------------------------------------------------------------------
 // 1D OPERATOR MATRICES
 // ------------------------------------------------------------------------
-void mesh_t::MassMatrix1D(int _Np, dfloat *V, dfloat *_MM){
+void mesh_t::MassMatrix1D(const int _Np,
+                          const memory<dfloat> V,
+                          memory<dfloat>& _MM){
 
-  // masMatrix = inv(V')*inv(V) = inv(V*V')
+  // massMatrix = inv(V')*inv(V) = inv(V*V')
+  _MM.malloc(_Np*_Np);
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_Np;++m){
       dfloat res = 0;
@@ -93,71 +108,74 @@ void mesh_t::MassMatrix1D(int _Np, dfloat *V, dfloat *_MM){
       _MM[n*_Np + m] = res;
     }
   }
-  matrixInverse(_Np, _MM);
+  linAlg_t::matrixInverse(_Np, _MM);
 }
 
-void mesh_t::Dmatrix1D(int _N, int NpointsIn, dfloat *_rIn,
-                               int NpointsOut, dfloat *_rOut, dfloat *_Dr){
+void mesh_t::Dmatrix1D(const int _N,
+                       const memory<dfloat> _rIn,
+                       const memory<dfloat> _rOut,
+                       memory<dfloat>& _Dr){
 
-  // need NpointsIn = (_N+1)
-  if (NpointsIn != _N+1)
-    LIBP_ABORT(string("Invalid Differentiation operator requested."))
 
-  int _Np = _N+1;
+  const int _Np = _N+1;
+  const int NpointsIn  = _rIn.length();
+  const int NpointsOut = _rOut.length();
 
-  dfloat *V  = (dfloat *) calloc(NpointsIn*_Np, sizeof(dfloat));
-  dfloat *Vr = (dfloat *) calloc(NpointsOut*_Np, sizeof(dfloat));
+  // need NpointsIn = (_N+1)
+  LIBP_ABORT("Invalid Differentiation operator requested.",
+             NpointsIn != _N+1);
 
-  Vandermonde1D(_N, NpointsIn, _rIn, V);
-  GradVandermonde1D(_N, NpointsOut, _rOut, Vr);
+  memory<dfloat> V;
+  memory<dfloat> Vr;
+  Vandermonde1D(_N, _rIn, V);
+  GradVandermonde1D(_N, _rOut, Vr);
 
   //D = Vr/V
-  matrixRightSolve(NpointsOut, _Np, Vr, _Np, _Np, V, _Dr);
-
-  free(V);
-  free(Vr);
+  _Dr.malloc(NpointsOut*_Np);
+  linAlg_t::matrixRightSolve(NpointsOut, _Np, Vr, _Np, _Np, V, _Dr);
 }
 
-void mesh_t::InterpolationMatrix1D(int _N,
-                               int NpointsIn, dfloat *rIn,
-                               int NpointsOut, dfloat *rOut,
-                               dfloat *I){
+void mesh_t::InterpolationMatrix1D(const int _N,
+                                   const memory<dfloat> _rIn,
+                                   const memory<dfloat> _rOut,
+                                   memory<dfloat>& I){
 
-  // need NpointsIn = (_N+1)
-  if (NpointsIn != _N+1)
-    LIBP_ABORT(string("Invalid Interplation operator requested."))
+  const int _Np = _N+1;
+  const int NpointsIn  = _rIn.length();
+  const int NpointsOut = _rOut.length();
 
-  dfloat *VIn = (dfloat*) malloc(NpointsIn*(_N+1)*sizeof(dfloat));
-  dfloat *VOut= (dfloat*) malloc(NpointsOut*(_N+1)*sizeof(dfloat));
-
-  Vandermonde1D(_N, NpointsIn,   rIn, VIn);
-  Vandermonde1D(_N, NpointsOut, rOut, VOut);
+  // need NpointsIn = (_N+1)
+  LIBP_ABORT("Invalid Interpolation operator requested.",
+             NpointsIn != _N+1);
 
-  matrixRightSolve(NpointsOut, _N+1, VOut, NpointsIn, _N+1, VIn, I);
+  memory<dfloat> VIn;
+  memory<dfloat> VOut;
+  Vandermonde1D(_N, _rIn, VIn);
+  Vandermonde1D(_N, _rOut, VOut);
 
-  free(VIn); free(VOut);
+  I.malloc(NpointsIn*NpointsOut);
+  linAlg_t::matrixRightSolve(NpointsOut, _Np, VOut,
+                             NpointsIn, _Np, VIn, I);
 }
 
-void mesh_t::DegreeRaiseMatrix1D(int Nc, int Nf, dfloat *P){
-
-  int Nqc = Nc+1;
-  int Nqf = Nf+1;
-
-  dfloat *rc = (dfloat *) malloc(Nqc*sizeof(dfloat));
-  dfloat *rf = (dfloat *) malloc(Nqf*sizeof(dfloat));
+void mesh_t::DegreeRaiseMatrix1D(const int Nc, const int Nf,
+                                 memory<dfloat>& P){
 
+  memory<dfloat> rc;
+  memory<dfloat> rf;
   Nodes1D(Nc, rc);
   Nodes1D(Nf, rf);
 
-  InterpolationMatrix1D(Nc, Nqc, rc, Nqf, rf, P);
-
-  free(rc); free(rf);
+  InterpolationMatrix1D(Nc, rc, rf, P);
 }
 
-void mesh_t::CubatureWeakDmatrix1D(int _Nq, int _cubNq,
-                                     dfloat *_cubProject, dfloat *_cubD, dfloat *_cubPDT){
+void mesh_t::CubatureWeakDmatrix1D(const int _Nq, const int _cubNq,
+                                   const memory<dfloat> _cubProject,
+                                   const memory<dfloat> _cubD,
+                                   memory<dfloat>& _cubPDT){
 
   // cubPDT = cubProject*cubD';
+  _cubPDT.malloc(_Nq*_cubNq);
   for(int n=0;n<_Nq;++n){
     for(int m=0;m<_cubNq;++m){
       _cubPDT[n*_cubNq+m] = 0.0;
@@ -171,29 +189,30 @@ void mesh_t::CubatureWeakDmatrix1D(int _Nq, int _cubNq,
 // ------------------------------------------------------------------------
 // 1D JACOBI POLYNOMIALS
 // ------------------------------------------------------------------------
-static dfloat mygamma(dfloat x){
+static dfloat mygamma(const dfloat x){
   dfloat lgam = lgamma(x);
   dfloat gam  = signgam*exp(lgam);
   return gam;
 }
 
-dfloat mesh_t::JacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){
+dfloat mesh_t::JacobiP(const dfloat a, const dfloat alpha,
+                       const dfloat beta, const int _N){
 
-  dfloat ax = a;
+  const dfloat ax = a;
 
-  dfloat *P = (dfloat *) calloc((_N+1), sizeof(dfloat));
+  memory<dfloat> P(_N+1);
 
   // Zero order
-  dfloat gamma0 = pow(2,(alpha+beta+1))/(alpha+beta+1)*mygamma(1+alpha)*mygamma(1+beta)/mygamma(1+alpha+beta);
-  dfloat p0     = 1.0/sqrt(gamma0);
+  const dfloat gamma0 = pow(2,(alpha+beta+1))/(alpha+beta+1)*mygamma(1+alpha)*mygamma(1+beta)/mygamma(1+alpha+beta);
+  const dfloat p0     = 1.0/sqrt(gamma0);
 
-  if (_N==0){ free(P); return p0;}
+  if (_N==0){ return p0;}
   P[0] = p0;
 
   // first order
-  dfloat gamma1 = (alpha+1)*(beta+1)/(alpha+beta+3)*gamma0;
-  dfloat p1     = ((alpha+beta+2)*ax/2 + (alpha-beta)/2)/sqrt(gamma1);
-  if (_N==1){free(P); return p1;}
+  const dfloat gamma1 = (alpha+1)*(beta+1)/(alpha+beta+3)*gamma0;
+  const dfloat p1     = ((alpha+beta+2)*ax/2 + (alpha-beta)/2)/sqrt(gamma1);
+  if (_N==1){ return p1;}
 
   P[1] = p1;
 
@@ -207,13 +226,11 @@ dfloat mesh_t::JacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){
     P[i+1] = 1./anew*( -aold*P[i-1] + (ax-bnew)*P[i]);
     aold =anew;
   }
-
-  dfloat pN = P[_N];
-  free(P);
-  return pN;
+  return P[_N];
 }
 
-dfloat mesh_t::GradJacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){
+dfloat mesh_t::GradJacobiP(const dfloat a, const dfloat alpha,
+                           const dfloat beta, const int _N){
 
   dfloat PNr = 0;
 
@@ -226,53 +243,74 @@ dfloat mesh_t::GradJacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){
 // ------------------------------------------------------------------------
 // 1D GAUSS-LEGENDRE-LOBATTO QUADRATURE
 // ------------------------------------------------------------------------
-void mesh_t::JacobiGLL(int _N, dfloat *_x, dfloat *_w){
+void mesh_t::JacobiGLL(const int _N, memory<dfloat>& _x){
+
+  _x.malloc(_N+1);
 
   _x[0] = -1.;
   _x[_N] =  1.;
 
   if(_N>1){
-    dfloat *wtmp = (dfloat*) calloc(_N-1, sizeof(dfloat));
-    JacobiGQ(1,1, _N-2, _x+1, wtmp);
-    free(wtmp);
+    memory<dfloat> wtmp;
+    memory<dfloat> xp1 = _x + 1;
+    JacobiGQ(1,1, _N-2, xp1, wtmp);
   }
+}
 
-  if (_w!=NULL) {
-    int _Np = _N+1;
-    dfloat *_MM = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
-    dfloat  *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
+void mesh_t::JacobiGLL(const int _N,
+                       memory<dfloat>& _x,
+                       memory<dfloat>& _w){
 
-    Vandermonde1D(_N, _N+1, _x, V);
-    MassMatrix1D(_N+1, V, _MM);
+  _x.malloc(_N+1);
+  _w.malloc(_N+1);
 
-    // use weights from mass lumping
-    for(int n=0;n<=_N;++n){
-      dfloat res = 0;
-      for(int m=0;m<=_N;++m){
-        res += _MM[n*(_N+1)+m];
-      }
-      _w[n] = res;
+  _x[0] = -1.;
+  _x[_N] =  1.;
+
+  if(_N>1){
+    memory<dfloat> wtmp;
+    memory<dfloat> xp1 = _x + 1;
+    JacobiGQ(1,1, _N-2, xp1, wtmp);
+  }
+
+  memory<dfloat> V;
+  memory<dfloat> _MM;
+  Vandermonde1D(_N, _x, V);
+  MassMatrix1D(_N+1, V, _MM);
+
+  // use weights from mass lumping
+  for(int n=0;n<=_N;++n){
+    dfloat res = 0;
+    for(int m=0;m<=_N;++m){
+      res += _MM[n*(_N+1)+m];
     }
+    _w[n] = res;
   }
 }
 
 // ------------------------------------------------------------------------
 // 1D GAUSS QUADRATURE
 // ------------------------------------------------------------------------
-void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat *_x, dfloat *_w){
+void mesh_t::JacobiGQ(const dfloat alpha, const dfloat beta,
+                      const int _N,
+                      memory<dfloat>& _x,
+                      memory<dfloat>& _w){
 
   // function NGQ = JacobiGQ(alpha,beta,_N, _x, _w)
   // Purpose: Compute the _N'th order Gauss quadrature points, _x,
   //          and weights, _w, associated with the Jacobi
   //          polynomial, of type (alpha,beta) > -1 ( <> -0.5).
+  if (_x.length()==0) _x.malloc(_N+1);
+  if (_w.length()==0) _w.malloc(_N+1);
+
   if (_N==0){
     _x[0] = (alpha-beta)/(alpha+beta+2);
     _w[0] = 2;
   }
 
   // Form symmetric matrix from recurrence.
-  dfloat *J = (dfloat*) calloc((_N+1)*(_N+1), sizeof(dfloat));
-  dfloat *h1 = (dfloat*) calloc(_N+1, sizeof(dfloat));
+  memory<dfloat> J((_N+1)*(_N+1), 0.0);
+  memory<dfloat> h1(_N+1);
 
   for(int n=0;n<=_N;++n){
     h1[n] = 2*n+alpha+beta;
@@ -301,12 +339,11 @@ void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat *_x, dfloat *_w)
   // Compute quadrature by eigenvalue solve
 
   //  [V,D] = eig(J);
-  dfloat *WR = (dfloat*) calloc(_N+1, sizeof(dfloat));
-  dfloat *WI = (dfloat*) calloc(_N+1, sizeof(dfloat));
-  dfloat *VR = (dfloat*) calloc((_N+1)*(_N+1), sizeof(dfloat));
+  memory<dfloat> WI(_N+1);
+  memory<dfloat> VR((_N+1)*(_N+1));
 
   // _x = diag(D);
-  matrixEigenVectors(_N+1, J, VR, _x, WI);
+  linAlg_t::matrixEigenVectors(_N+1, J, VR, _x, WI);
 
   //_w = (V(1,:)').^2*2^(alpha+beta+1)/(alpha+beta+1)*gamma(alpha+1)*.gamma(beta+1)/gamma(alpha+beta+1);
   for(int n=0;n<=_N;++n){
@@ -332,10 +369,6 @@ void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat *_x, dfloat *_w)
     printf("zgl[%d] = % e, wgl[%d] = % e\n", n, _x[0][n], n, _w[0][n]);
   }
 #endif
-
-  free(WR);
-  free(WI);
-  free(VR);
 }
 
 /*
@@ -483,3 +516,5 @@ void meshCubatureWeakDmatrices1D(int _N, int _Np, dfloat *V,
   free(cubVr);
 }
 */
+
+} //namespace libp
diff --git a/libs/mesh/meshBasisHex3D.cpp b/libs/mesh/meshBasisHex3D.cpp
index 696ecf6e2..361e9f8eb 100644
--- a/libs/mesh/meshBasisHex3D.cpp
+++ b/libs/mesh/meshBasisHex3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,17 +25,26 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 // ------------------------------------------------------------------------
 // HEX 3D NODES
 // ------------------------------------------------------------------------
-void mesh_t::NodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){
-  int _Nq = _N+1;
-
-  dfloat *r1D = (dfloat*) malloc(_Nq*sizeof(dfloat));
+void mesh_t::NodesHex3D(const int _N,
+                        memory<dfloat>& _r,
+                        memory<dfloat>& _s,
+                        memory<dfloat>& _t){
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq*_Nq;
+
+  memory<dfloat> r1D;
   JacobiGLL(_N, r1D); //Gauss-Legendre-Lobatto nodes
 
+  _r.malloc(_Np);
+  _s.malloc(_Np);
+  _t.malloc(_Np);
+
   //Tensor product
   for (int k=0;k<_Nq;k++) {
     for (int j=0;j<_Nq;j++) {
@@ -46,11 +55,13 @@ void mesh_t::NodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){
       }
     }
   }
-
-  free(r1D);
 }
 
-void mesh_t::FaceNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes){
+void mesh_t::FaceNodesHex3D(const int _N,
+                            const memory<dfloat> _r,
+                            const memory<dfloat> _s,
+                            const memory<dfloat> _t,
+                            memory<int>& _faceNodes){
   int _Nq = _N+1;
   int _Nfp = _Nq*_Nq;
   int _Np = _Nq*_Nq*_Nq;
@@ -64,25 +75,30 @@ void mesh_t::FaceNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_fa
 
   const dfloat NODETOL = 1000.*deps;
 
+  _faceNodes.malloc(6*_Nfp);
   for (int n=0;n<_Np;n++) {
-    if(fabs(_t[n]+1)<NODETOL)
+    if(std::abs(_t[n]+1)<NODETOL)
       _faceNodes[0*_Nfp+(cnt[0]++)] = n;
-    if(fabs(_s[n]+1)<NODETOL)
+    if(std::abs(_s[n]+1)<NODETOL)
       _faceNodes[1*_Nfp+(cnt[1]++)] = n;
-    if(fabs(_r[n]-1)<NODETOL)
+    if(std::abs(_r[n]-1)<NODETOL)
       _faceNodes[2*_Nfp+(cnt[2]++)] = n;
-    if(fabs(_s[n]-1)<NODETOL)
+    if(std::abs(_s[n]-1)<NODETOL)
       _faceNodes[3*_Nfp+(cnt[3]++)] = n;
-    if(fabs(_r[n]+1)<NODETOL)
+    if(std::abs(_r[n]+1)<NODETOL)
       _faceNodes[4*_Nfp+(cnt[4]++)] = n;
-    if(fabs(_t[n]-1)<NODETOL)
+    if(std::abs(_t[n]-1)<NODETOL)
       _faceNodes[5*_Nfp+(cnt[5]++)] = n;
   }
 }
 
-void mesh_t::VertexNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_vertexNodes){
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
+void mesh_t::VertexNodesHex3D(const int _N,
+                              const memory<dfloat> _r,
+                              const memory<dfloat> _s,
+                              const memory<dfloat> _t,
+                              memory<int>& _vertexNodes){
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq*_Nq;
 
   dfloat deps = 1.;
   while((1.+deps)>1.)
@@ -90,6 +106,7 @@ void mesh_t::VertexNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_
 
   const dfloat NODETOL = 1000.*deps;
 
+  _vertexNodes.malloc(8);
   for(int n=0;n<_Np;++n){
     if( (_r[n]+1)*(_r[n]+1)+(_s[n]+1)*(_s[n]+1)+(_t[n]+1)*(_t[n]+1)<NODETOL)
       _vertexNodes[0] = n;
@@ -110,15 +127,173 @@ void mesh_t::VertexNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_
   }
 }
 
-void mesh_t::EquispacedNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){
-  int _Nq = _N+1;
+/*Find a matching array between nodes on matching faces */
+void mesh_t::FaceNodeMatchingHex3D(const memory<dfloat> _r,
+                                   const memory<dfloat> _s,
+                                   const memory<dfloat> _t,
+                                   const memory<int> _faceNodes,
+                                   const memory<int> _faceVertices,
+                                   memory<int>& R){
+  // R is flattened as [fM][fP][rot][n] and holds face-local node indices on fP
+  const int _Nfaces = 6;
+  const int _Nverts = 8;
+  const int _NfaceVertices = 4;
+
+  const int _Nfp = _faceNodes.length()/_Nfaces;
+
+  const dfloat NODETOL = 1.0e-5;
+
+  dfloat V0[4][2] = {{-1.0,-1.0},{ 1.0,-1.0},{ 1.0, 1.0},{-1.0, 1.0}};
+  dfloat V1[4][2] = {{-1.0,-1.0},{-1.0, 1.0},{ 1.0, 1.0},{ 1.0,-1.0}};
+
+  dfloat EX0[_Nverts], EY0[_Nverts];
+  dfloat EX1[_Nverts], EY1[_Nverts];
+
+  memory<dfloat> x0(_Nfp);
+  memory<dfloat> y0(_Nfp);
+
+  memory<dfloat> x1(_Nfp);
+  memory<dfloat> y1(_Nfp);
+
+  R.malloc(_Nfaces*_Nfaces*_NfaceVertices*_Nfp);
+
+  for (int fM=0;fM<_Nfaces;fM++) {
+
+    for (int v=0;v<_Nverts;v++) {
+      EX0[v] = 0.0; EY0[v] = 0.0;
+    }
+    //setup top element with face fM on the bottom
+    for (int v=0;v<_NfaceVertices;v++) {
+      int fv = _faceVertices[fM*_NfaceVertices + v];
+      EX0[fv] = V0[v][0]; EY0[fv] = V0[v][1];
+    }
+
+    for(int n=0;n<_Nfp;++n){ /* for each face node */
+      const int fn = _faceNodes[fM*_Nfp+n];
+
+      /* (r,s,t) coordinates of interpolation nodes*/
+      dfloat rn = _r[fn];
+      dfloat sn = _s[fn];
+      dfloat tn = _t[fn];
+
+      /* physical coordinate of interpolation node */
+      x0[n] =
+        +0.125*(1-rn)*(1-sn)*(1-tn)*EX0[0]
+        +0.125*(1+rn)*(1-sn)*(1-tn)*EX0[1]
+        +0.125*(1+rn)*(1+sn)*(1-tn)*EX0[2]
+        +0.125*(1-rn)*(1+sn)*(1-tn)*EX0[3]
+        +0.125*(1-rn)*(1-sn)*(1+tn)*EX0[4]
+        +0.125*(1+rn)*(1-sn)*(1+tn)*EX0[5]
+        +0.125*(1+rn)*(1+sn)*(1+tn)*EX0[6]
+        +0.125*(1-rn)*(1+sn)*(1+tn)*EX0[7];
+
+      y0[n] =
+        +0.125*(1-rn)*(1-sn)*(1-tn)*EY0[0]
+        +0.125*(1+rn)*(1-sn)*(1-tn)*EY0[1]
+        +0.125*(1+rn)*(1+sn)*(1-tn)*EY0[2]
+        +0.125*(1-rn)*(1+sn)*(1-tn)*EY0[3]
+        +0.125*(1-rn)*(1-sn)*(1+tn)*EY0[4]
+        +0.125*(1+rn)*(1-sn)*(1+tn)*EY0[5]
+        +0.125*(1+rn)*(1+sn)*(1+tn)*EY0[6]
+        +0.125*(1-rn)*(1+sn)*(1+tn)*EY0[7];
+    }
+
+    for (int fP=0;fP<_Nfaces;fP++) { /*For each neighbor face */
+      for (int rot=0;rot<_NfaceVertices;rot++) { /* For each face rotation */
+        // Zero vertices
+        for (int v=0;v<_Nverts;v++) {
+          EX1[v] = 0.0; EY1[v] = 0.0;
+        }
+        //setup bottom element with face fP on the top
+        for (int v=0;v<_NfaceVertices;v++) {
+          int fv = _faceVertices[fP*_NfaceVertices + ((v+rot)%_NfaceVertices)];
+          EX1[fv] = V1[v][0]; EY1[fv] = V1[v][1];
+        }
+
+        for(int n=0;n<_Nfp;++n){ /* for each node */
+          const int fn = _faceNodes[fP*_Nfp+n];
+
+          /* (r,s,t) coordinates of interpolation nodes*/
+          dfloat rn = _r[fn];
+          dfloat sn = _s[fn];
+          dfloat tn = _t[fn];
+
+          /* physical coordinate of interpolation node */
+          x1[n] =  0.125*(1-rn)*(1-sn)*(1-tn)*EX1[0]
+                  +0.125*(1+rn)*(1-sn)*(1-tn)*EX1[1]
+                  +0.125*(1+rn)*(1+sn)*(1-tn)*EX1[2]
+                  +0.125*(1-rn)*(1+sn)*(1-tn)*EX1[3]
+                  +0.125*(1-rn)*(1-sn)*(1+tn)*EX1[4]
+                  +0.125*(1+rn)*(1-sn)*(1+tn)*EX1[5]
+                  +0.125*(1+rn)*(1+sn)*(1+tn)*EX1[6]
+                  +0.125*(1-rn)*(1+sn)*(1+tn)*EX1[7];
+
+          y1[n] =  0.125*(1-rn)*(1-sn)*(1-tn)*EY1[0]
+                  +0.125*(1+rn)*(1-sn)*(1-tn)*EY1[1]
+                  +0.125*(1+rn)*(1+sn)*(1-tn)*EY1[2]
+                  +0.125*(1-rn)*(1+sn)*(1-tn)*EY1[3]
+                  +0.125*(1-rn)*(1-sn)*(1+tn)*EY1[4]
+                  +0.125*(1+rn)*(1-sn)*(1+tn)*EY1[5]
+                  +0.125*(1+rn)*(1+sn)*(1+tn)*EY1[6]
+                  +0.125*(1-rn)*(1+sn)*(1+tn)*EY1[7];
+        }
+
+        /* for each node on this face find the neighbor node */
+        for(int n=0;n<_Nfp;++n){
+          const dfloat xM = x0[n];
+          const dfloat yM = y0[n];
+
+          int m=0;
+          for(;m<_Nfp;++m){ /* for each neighbor node */
+            const dfloat xP = x1[m];
+            const dfloat yP = y1[m];
+
+            /* distance between target and neighbor node */
+            const dfloat dist = pow(xM-xP,2) + pow(yM-yP,2);
+
+            /* if neighbor node is close to target, match */
+            if(dist<NODETOL){
+              R[fM*_Nfaces*_NfaceVertices*_Nfp
+                + fP*_NfaceVertices*_Nfp
+                + rot*_Nfp + n] = m;
+              break;
+            }
+          }
+
+          /*Check*/
+          // The search loop only reaches m==_Nfp when no neighbor node was
+          // within NODETOL. Test the loop index directly: re-reading x1[m]
+          // and y1[m] here would index one past the end of both arrays.
+          const bool unmatched = (m==_Nfp);
+
+          //This shouldn't happen
+          LIBP_ABORT("Unable to match face node, face: " << fM
+                     << ", matching face: " << fP
+                     << ", rotation: " << rot
+                     << ", node: " << n
+                     << ". Is the reference node set not symmetric?",
+                     unmatched);
+        }
+      }
+    }
+  }
+}
+
+void mesh_t::EquispacedNodesHex3D(const int _N,
+                                  memory<dfloat>& _r,
+                                  memory<dfloat>& _s,
+                                  memory<dfloat>& _t){
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq*_Nq;
 
   //Equispaced 1D nodes
-  dfloat *r1D = (dfloat*) malloc(_Nq*sizeof(dfloat));
-  dfloat dr = 2.0/_N;
-  for (int i=0;i<_Nq;i++) r1D[i] = -1.0 + i*dr;
+  memory<dfloat> r1D;
+  EquispacedNodes1D(_N, r1D);
 
   //Tensor product
+  _r.malloc(_Np);
+  _s.malloc(_Np);
+  _t.malloc(_Np);
   for (int k=0;k<_Nq;k++) {
     for (int j=0;j<_Nq;j++) {
       for (int i=0;i<_Nq;i++) {
@@ -128,13 +303,14 @@ void mesh_t::EquispacedNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){
       }
     }
   }
-
-  free(r1D);
 }
 
-void mesh_t::EquispacedEToVHex3D(int _N, int *_EToV){
-  int _Nq = _N+1;
-  int _Nverts = 4;
+void mesh_t::EquispacedEToVHex3D(const int _N, memory<int>& _EToV){
+  const int _Nq = _N+1;
+  const int _Nelements = 6*_N*_N*_N;
+  const int _Nverts = 4;
+
+  _EToV.malloc(_Nelements*_Nverts);
 
   //Tensor product
   int cnt=0;
@@ -187,9 +363,12 @@ void mesh_t::EquispacedEToVHex3D(int _N, int *_EToV){
   }
 }
 
-void mesh_t::SEMFEMEToVHex3D(int _N, int *_EToV){
-  int _Nq = _N+1;
-  int _Nverts = 8;
+void mesh_t::SEMFEMEToVHex3D(const int _N, memory<int>& _EToV){
+  const int _Nq = _N+1;
+  const int _Nelements = _N*_N*_N;
+  const int _Nverts = 8;
+
+  _EToV.malloc(_Nelements*_Nverts);
 
   //Tensor product
   int cnt=0;
@@ -213,48 +392,68 @@ void mesh_t::SEMFEMEToVHex3D(int _N, int *_EToV){
 // ------------------------------------------------------------------------
 // ORTHONORMAL BASIS POLYNOMIALS
 // ------------------------------------------------------------------------
-void mesh_t::OrthonormalBasisHex3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *P){
-  *P = JacobiP(a,0,0,i)*JacobiP(b,0,0,j)*JacobiP(c,0,0,k);
+void mesh_t::OrthonormalBasisHex3D(const dfloat a, const dfloat b, const dfloat c,
+                                   const int i, const int j, const int k,
+                                   dfloat& P){
+  P = JacobiP(a,0,0,i)*JacobiP(b,0,0,j)*JacobiP(c,0,0,k);
 }
 
-void mesh_t::GradOrthonormalBasisHex3D(dfloat a, dfloat b, dfloat c, int i, int j, int k, dfloat *Pr, dfloat *Ps, dfloat *Pt){
-  *Pr = GradJacobiP(a,0,0,i)*JacobiP(b,0,0,j)*JacobiP(c,0,0,k);
-  *Ps = JacobiP(a,0,0,i)*GradJacobiP(b,0,0,j)*JacobiP(c,0,0,k);
-  *Pt = JacobiP(a,0,0,i)*JacobiP(b,0,0,j)*GradJacobiP(c,0,0,k);
+void mesh_t::GradOrthonormalBasisHex3D(const dfloat a, const dfloat b, const dfloat c,
+                                       const int i, const int j, const int k,
+                                       dfloat& Pr, dfloat& Ps, dfloat& Pt){
+  Pr = GradJacobiP(a,0,0,i)*JacobiP(b,0,0,j)*JacobiP(c,0,0,k);
+  Ps = JacobiP(a,0,0,i)*GradJacobiP(b,0,0,j)*JacobiP(c,0,0,k);
+  Pt = JacobiP(a,0,0,i)*JacobiP(b,0,0,j)*GradJacobiP(c,0,0,k);
 }
 
 // ------------------------------------------------------------------------
 // 2D VANDERMONDE MATRICES
 // ------------------------------------------------------------------------
 
-void mesh_t::VandermondeHex3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t, dfloat *V){
+void mesh_t::VandermondeHex3D(const int _N,
+                              const memory<dfloat> _r,
+                              const memory<dfloat> _s,
+                              const memory<dfloat> _t,
+                              memory<dfloat>& V){
 
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq*_Nq;
+  const int Npoints = _r.length();
 
+  V.malloc(Npoints*_Np);
   for(int n=0; n<Npoints; n++){
     for(int k=0; k<_Nq; k++){
       for(int j=0; j<_Nq; j++){
         for(int i=0; i<_Nq; i++){
           int id = n*_Np+i+j*_Nq+k*_Nq*_Nq;
-          OrthonormalBasisHex3D(_r[n], _s[n], _t[n], i, j, k, V+id);
+          OrthonormalBasisHex3D(_r[n], _s[n], _t[n], i, j, k, V[id]);
         }
       }
     }
   }
 }
 
-void mesh_t::GradVandermondeHex3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t, dfloat *Vr, dfloat *Vs, dfloat *Vt){
-
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
-
+void mesh_t::GradVandermondeHex3D(const int _N,
+                                  const memory<dfloat> _r,
+                                  const memory<dfloat> _s,
+                                  const memory<dfloat> _t,
+                                  memory<dfloat>& Vr,
+                                  memory<dfloat>& Vs,
+                                  memory<dfloat>& Vt){
+
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq*_Nq;
+  const int Npoints = _r.length();
+
+  Vr.malloc(Npoints*_Np);
+  Vs.malloc(Npoints*_Np);
+  Vt.malloc(Npoints*_Np);
   for(int n=0; n<Npoints; n++){
     for(int k=0; k<_Nq; k++){
       for(int j=0; j<_Nq; j++){
         for(int i=0; i<_Nq; i++){
           int id = n*_Np+i+j*_Nq+k*_Nq*_Nq;
-          GradOrthonormalBasisHex3D(_r[n], _s[n], _t[n], i, j, k, Vr+id, Vs+id, Vt+id);
+          GradOrthonormalBasisHex3D(_r[n], _s[n], _t[n], i, j, k, Vr[id], Vs[id], Vt[id]);
         }
       }
     }
@@ -264,9 +463,12 @@ void mesh_t::GradVandermondeHex3D(int _N, int Npoints, dfloat *_r, dfloat *_s, d
 // ------------------------------------------------------------------------
 // 2D OPERATOR MATRICES
 // ------------------------------------------------------------------------
-void mesh_t::MassMatrixHex3D(int _Np, dfloat *V, dfloat *_MM){
+void mesh_t::MassMatrixHex3D(const int _Np,
+                             const memory<dfloat> V,
+                             memory<dfloat>& _MM){
 
-  // masMatrix = inv(V')*inv(V) = inv(V*V')
+  // massMatrix = inv(V')*inv(V) = inv(V*V')
+  _MM.malloc(_Np*_Np);
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_Np;++m){
       dfloat res = 0;
@@ -276,15 +478,18 @@ void mesh_t::MassMatrixHex3D(int _Np, dfloat *V, dfloat *_MM){
       _MM[n*_Np + m] = res;
     }
   }
-  matrixInverse(_Np, _MM);
+  linAlg_t::matrixInverse(_Np, _MM);
 }
 
-void mesh_t::LumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_MM){
+void mesh_t::LumpedMassMatrixHex3D(const int _N,
+                                   const memory<dfloat> _gllw,
+                                   memory<dfloat>& _MM){
 
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq*_Nq;
 
   // LumpedMassMatrix = gllw \ctimes gllw \ctimes gllw
+  _MM.malloc(_Np*_Np, 0.0);
   for(int k=0;k<_Nq;++k){
     for(int n=0;n<_Nq;++n){
       for(int m=0;m<_Nq;++m){
@@ -295,12 +500,15 @@ void mesh_t::LumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_MM){
   }
 }
 
-void mesh_t::invLumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_invMM){
+void mesh_t::invLumpedMassMatrixHex3D(const int _N,
+                                      const memory<dfloat> _gllw,
+                                      memory<dfloat>& _invMM){
 
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq*_Nq;
 
   // invLumpedMassMatrix = invgllw \ctimes invgllw
+  _invMM.malloc(_Np*_Np, 0.0);
   for(int k=0;k<_Nq;++k){
     for(int n=0;n<_Nq;++n){
       for(int m=0;m<_Nq;++m){
@@ -311,47 +519,56 @@ void mesh_t::invLumpedMassMatrixHex3D(int _N, dfloat *_gllw, dfloat *_invMM){
   }
 }
 
-void mesh_t::DmatrixHex3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t,
-                                                dfloat *_Dr, dfloat *_Ds, dfloat *_Dt){
-
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
+void mesh_t::DmatrixHex3D(const int _N,
+                          const memory<dfloat> _r,
+                          const memory<dfloat> _s,
+                          const memory<dfloat> _t,
+                          memory<dfloat>& _D){
 
-  dfloat *V  = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vr = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vs = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vt = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq*_Nq;
 
-  VandermondeHex3D(_N, Npoints, _r, _s, _t, V);
-  GradVandermondeHex3D(_N, Npoints, _r, _s, _t, Vr, Vs, Vt);
+  memory<dfloat> V, Vr, Vs, Vt;
+  VandermondeHex3D(_N, _r, _s, _t, V);
+  GradVandermondeHex3D(_N, _r, _s, _t, Vr, Vs, Vt);
 
   //Dr = Vr/V, Ds = Vs/V, Dt = Vt/V
-  matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr);
-  matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds);
-  matrixRightSolve(_Np, _Np, Vt, _Np, _Np, V, _Dt);
-
-  free(V); free(Vr); free(Vs); free(Vt);
+  _D.malloc(3*_Np*_Np);
+  memory<dfloat> _Dr = _D + 0*_Np*_Np;
+  memory<dfloat> _Ds = _D + 1*_Np*_Np;
+  memory<dfloat> _Dt = _D + 2*_Np*_Np;
+  linAlg_t::matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr);
+  linAlg_t::matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds);
+  linAlg_t::matrixRightSolve(_Np, _Np, Vt, _Np, _Np, V, _Dt);
 }
 
-void mesh_t::InterpolationMatrixHex3D(int _N,
-                               int NpointsIn, dfloat *rIn, dfloat *sIn, dfloat *tIn,
-                               int NpointsOut, dfloat *rOut, dfloat *sOut, dfloat *tOut,
-                               dfloat *I){
+void mesh_t::InterpolationMatrixHex3D(const int _N,
+                                      const memory<dfloat> rIn,
+                                      const memory<dfloat> sIn,
+                                      const memory<dfloat> tIn,
+                                      const memory<dfloat> rOut,
+                                      const memory<dfloat> sOut,
+                                      const memory<dfloat> tOut,
+                                      memory<dfloat>& I){
 
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq*_Nq;
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq*_Nq;
 
-  // need NpointsIn = _Np
-  if (NpointsIn != _Np)
-    LIBP_ABORT(string("Invalid Interplation operator requested."))
+  const int NpointsIn  = rIn.length();
+  const int NpointsOut = rOut.length();
 
-  dfloat *VIn = (dfloat*) malloc(NpointsIn*_Np*sizeof(dfloat));
-  dfloat *VOut= (dfloat*) malloc(NpointsOut*_Np*sizeof(dfloat));
+  // need NpointsIn = _Np
+  LIBP_ABORT("Invalid Interplation operator requested.",
+             NpointsIn != _Np);
 
-  VandermondeHex3D(_N, NpointsIn,   rIn, sIn, tIn, VIn);
-  VandermondeHex3D(_N, NpointsOut, rOut, sOut, tOut, VOut);
+  memory<dfloat> VIn;
+  memory<dfloat> VOut;
+  VandermondeHex3D(_N, rIn, sIn, tIn, VIn);
+  VandermondeHex3D(_N, rOut, sOut, tOut, VOut);
 
-  matrixRightSolve(NpointsOut, _Np, VOut, NpointsIn, _Np, VIn, I);
+  I.malloc(NpointsIn*NpointsOut);
+  linAlg_t::matrixRightSolve(NpointsOut, _Np, VOut,
+                             NpointsIn, _Np, VIn, I);
+}
 
-  free(VIn); free(VOut);
-}
\ No newline at end of file
+} //namespace libp
diff --git a/libs/mesh/meshBasisQuad2D.cpp b/libs/mesh/meshBasisQuad2D.cpp
index bba5f10ec..932fd1460 100644
--- a/libs/mesh/meshBasisQuad2D.cpp
+++ b/libs/mesh/meshBasisQuad2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,17 +25,24 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
+
+namespace libp {
 
 // ------------------------------------------------------------------------
 // QUAD 2D NODES
 // ------------------------------------------------------------------------
-void mesh_t::NodesQuad2D(int _N, dfloat *_r, dfloat *_s){
-  int _Nq = _N+1;
+void mesh_t::NodesQuad2D(const int _N,
+                         memory<dfloat>& _r,
+                         memory<dfloat>& _s){
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq;
 
-  dfloat *r1D = (dfloat*) malloc(_Nq*sizeof(dfloat));
+  memory<dfloat> r1D;
   JacobiGLL(_N, r1D); //Gauss-Legendre-Lobatto nodes
 
+  _r.malloc(_Np);
+  _s.malloc(_Np);
+
   //Tensor product
   for (int j=0;j<_Nq;j++) {
     for (int i=0;i<_Nq;i++) {
@@ -43,14 +50,15 @@ void mesh_t::NodesQuad2D(int _N, dfloat *_r, dfloat *_s){
       _s[i+j*_Nq] = r1D[j];
     }
   }
-
-  free(r1D);
 }
 
-void mesh_t::FaceNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes){
-  int _Nq = _N+1;
-  int _Nfp = _Nq;
-  int _Np = _Nq*_Nq;
+void mesh_t::FaceNodesQuad2D(const int _N,
+                             const memory<dfloat> _r,
+                             const memory<dfloat> _s,
+                             memory<int>& _faceNodes){
+  const int _Nq = _N+1;
+  const int _Nfp = _Nq;
+  const int _Np = _Nq*_Nq;
 
   int cnt[4];
   for (int i=0;i<4;i++) cnt[i]=0;
@@ -61,6 +69,7 @@ void mesh_t::FaceNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes){
 
   const dfloat NODETOL = 1000.*deps;
 
+  _faceNodes.malloc(4*_Nfp);
   for (int n=0;n<_Np;n++) {
     if(fabs(_s[n]+1)<NODETOL)
       _faceNodes[0*_Nfp+(cnt[0]++)] = n;
@@ -73,9 +82,12 @@ void mesh_t::FaceNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes){
   }
 }
 
-void mesh_t::VertexNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes){
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq;
+void mesh_t::VertexNodesQuad2D(const int _N,
+                               const memory<dfloat> _r,
+                               const memory<dfloat> _s,
+                               memory<int>& _vertexNodes){
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq;
 
   dfloat deps = 1.;
   while((1.+deps)>1.)
@@ -83,6 +95,7 @@ void mesh_t::VertexNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes
 
   const dfloat NODETOL = 1000.*deps;
 
+  _vertexNodes.malloc(4);
   for(int n=0;n<_Np;++n){
     if( (_r[n]+1)*(_r[n]+1)+(_s[n]+1)*(_s[n]+1)<NODETOL)
       _vertexNodes[0] = n;
@@ -95,27 +108,147 @@ void mesh_t::VertexNodesQuad2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes
   }
 }
 
-void mesh_t::EquispacedNodesQuad2D(int _N, dfloat *_r, dfloat *_s){
-  int _Nq = _N+1;
+/*Find a matching array between nodes on matching faces */
+void mesh_t::FaceNodeMatchingQuad2D(const memory<dfloat> _r,
+                                    const memory<dfloat> _s,
+                                    const memory<int> _faceNodes,
+                                    const memory<int> _faceVertices,
+                                    memory<int>& R){
+  // R is flattened as [fM][fP][rot][n] and holds face-local node indices on fP
+  const int _Nfaces = 4;
+  const int _Nverts = 4;
+  const int _NfaceVertices = 2;
+
+  const int _Nfp = _faceNodes.length()/_Nfaces;
+
+  const dfloat NODETOL = 1.0e-5;
+
+  dfloat V[2] = {-1.0, 1.0};
+
+  dfloat EX0[_Nverts];
+  dfloat EX1[_Nverts];
+
+  memory<dfloat> x0(_Nfp);
+  memory<dfloat> x1(_Nfp);
+
+  R.malloc(_Nfaces*_Nfaces*_NfaceVertices*_Nfp);
+
+  for (int fM=0;fM<_Nfaces;fM++) {
+
+    for (int v=0;v<_Nverts;v++) {
+      EX0[v] = 0.0;
+    }
+    //setup top element with face fM on the bottom
+    for (int v=0;v<_NfaceVertices;v++) {
+      int fv = _faceVertices[fM*_NfaceVertices + v];
+      EX0[fv] = V[v];
+    }
+
+    for(int n=0;n<_Nfp;++n){ /* for each face node */
+      const int fn = _faceNodes[fM*_Nfp+n];
+
+      /* (r,s) coordinates of interpolation nodes*/
+      dfloat rn = _r[fn];
+      dfloat sn = _s[fn];
+
+      /* physical coordinate of interpolation node */
+      x0[n] = 0.25*(1-rn)*(1-sn)*EX0[0]
+             +0.25*(1+rn)*(1-sn)*EX0[1]
+             +0.25*(1+rn)*(1+sn)*EX0[2]
+             +0.25*(1-rn)*(1+sn)*EX0[3];
+    }
+
+    for (int fP=0;fP<_Nfaces;fP++) { /*For each neighbor face */
+      for (int rot=0;rot<_NfaceVertices;rot++) { /* For each face rotation */
+        // Zero vertices
+        for (int v=0;v<_Nverts;v++) {
+          EX1[v] = 0.0;
+        }
+        //setup bottom element with face fP on the top
+        for (int v=0;v<_NfaceVertices;v++) {
+          int fv = _faceVertices[fP*_NfaceVertices + ((v+rot)%_NfaceVertices)];
+          EX1[fv] = V[v];
+        }
+
+        for(int n=0;n<_Nfp;++n){ /* for each node */
+          const int fn = _faceNodes[fP*_Nfp+n];
+
+          /* (r,s) coordinates of interpolation nodes*/
+          dfloat rn = _r[fn];
+          dfloat sn = _s[fn];
+
+          /* physical coordinate of interpolation node */
+          x1[n] = 0.25*(1-rn)*(1-sn)*EX1[0]
+             +0.25*(1+rn)*(1-sn)*EX1[1]
+             +0.25*(1+rn)*(1+sn)*EX1[2]
+             +0.25*(1-rn)*(1+sn)*EX1[3];
+        }
+
+        /* for each node on this face find the neighbor node */
+        for(int n=0;n<_Nfp;++n){
+          const dfloat xM = x0[n];
+
+          int m=0;
+          for(;m<_Nfp;++m){ /* for each neighbor node */
+            const dfloat xP = x1[m];
+
+            /* distance between target and neighbor node */
+            const dfloat dist = pow(xM-xP,2);
+
+            /* if neighbor node is close to target, match */
+            if(dist<NODETOL){
+              R[fM*_Nfaces*_NfaceVertices*_Nfp
+                + fP*_NfaceVertices*_Nfp
+                + rot*_Nfp + n] = m;
+              break;
+            }
+          }
+
+          /*Check*/
+          // m==_Nfp means the search loop found no neighbor within NODETOL.
+          // Test the index itself; x1[m] would be one past the end of x1.
+          const bool unmatched = (m==_Nfp);
+
+          //This shouldn't happen
+          LIBP_ABORT("Unable to match face node, face: " << fM
+                     << ", matching face: " << fP
+                     << ", rotation: " << rot
+                     << ", node: " << n
+                     << ". Is the reference node set not symmetric?",
+                     unmatched);
+        }
+      }
+    }
+  }
+}
+
+void mesh_t::EquispacedNodesQuad2D(const int _N,
+                                   memory<dfloat>& _r,
+                                   memory<dfloat>& _s){
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq;
 
   //Equispaced 1D nodes
-  dfloat *r1D = (dfloat*) malloc(_Nq*sizeof(dfloat));
+  memory<dfloat> r1D;
   EquispacedNodes1D(_N, r1D);
 
   //Tensor product
+  _r.malloc(_Np);
+  _s.malloc(_Np);
   for (int j=0;j<_Nq;j++) {
     for (int i=0;i<_Nq;i++) {
       _r[i+j*_Nq] = r1D[i];
       _s[i+j*_Nq] = r1D[j];
     }
   }
-
-  free(r1D);
 }
 
-void mesh_t::EquispacedEToVQuad2D(int _N, int *_EToV){
-  int _Nq = _N+1;
-  int _Nverts = 3;
+void mesh_t::EquispacedEToVQuad2D(const int _N, memory<int>& _EToV){
+  const int _Nq = _N+1;
+  const int _Nelements = 2*_N*_N;
+  const int _Nverts = 3;
+
+  _EToV.malloc(_Nelements*_Nverts);
 
   //Tensor product
   int cnt=0;
@@ -134,9 +267,12 @@ void mesh_t::EquispacedEToVQuad2D(int _N, int *_EToV){
   }
 }
 
-void mesh_t::SEMFEMEToVQuad2D(int _N, int *_EToV){
-  int _Nq = _N+1;
-  int _Nverts = 4;
+void mesh_t::SEMFEMEToVQuad2D(const int _N, memory<int>& _EToV){
+  const int _Nq = _N+1;
+  const int _Nelements = _N*_N;
+  const int _Nverts = 4;
+
+  _EToV.malloc(_Nelements*_Nverts);
 
   //Tensor product
   int cnt=0;
@@ -154,44 +290,60 @@ void mesh_t::SEMFEMEToVQuad2D(int _N, int *_EToV){
 // ------------------------------------------------------------------------
 // ORTHONORMAL BASIS POLYNOMIALS
 // ------------------------------------------------------------------------
-void mesh_t::OrthonormalBasisQuad2D(dfloat a, dfloat b, int i, int j, dfloat *P){
-  *P = JacobiP(a,0,0,i)*JacobiP(b,0,0,j);
+void mesh_t::OrthonormalBasisQuad2D(const dfloat a, const dfloat b,
+                                    const int i, const int j,
+                                    dfloat& P){
+  P = JacobiP(a,0,0,i)*JacobiP(b,0,0,j);
 }
 
-void mesh_t::GradOrthonormalBasisQuad2D(dfloat a, dfloat b, int i, int j, dfloat *Pr, dfloat *Ps){
-  *Pr = GradJacobiP(a,0,0,i)*JacobiP(b,0,0,j);
-  *Ps = JacobiP(a,0,0,i)*GradJacobiP(b,0,0,j);
+void mesh_t::GradOrthonormalBasisQuad2D(const dfloat a, const dfloat b,
+                                        const int i, const int j,
+                                        dfloat& Pr, dfloat& Ps){
+  Pr = GradJacobiP(a,0,0,i)*JacobiP(b,0,0,j);
+  Ps = JacobiP(a,0,0,i)*GradJacobiP(b,0,0,j);
 }
 
 // ------------------------------------------------------------------------
 // 2D VANDERMONDE MATRICES
 // ------------------------------------------------------------------------
 
-void mesh_t::VandermondeQuad2D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *V){
+void mesh_t::VandermondeQuad2D(const int _N,
+                              const memory<dfloat> _r,
+                              const memory<dfloat> _s,
+                              memory<dfloat>& V){
 
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq;
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq;
+  const int Npoints = _r.length();
 
+  V.malloc(Npoints*_Np);
   for(int n=0; n<Npoints; n++){
     for(int j=0; j<_Nq; j++){
       for(int i=0; i<_Nq; i++){
         int id = n*_Np+i+j*_Nq;
-        OrthonormalBasisQuad2D(_r[n], _s[n], i, j, V+id);
+        OrthonormalBasisQuad2D(_r[n], _s[n], i, j, V[id]);
       }
     }
   }
 }
 
-void mesh_t::GradVandermondeQuad2D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *Vr, dfloat *Vs){
+void mesh_t::GradVandermondeQuad2D(const int _N,
+                                  const memory<dfloat> _r,
+                                  const memory<dfloat> _s,
+                                  memory<dfloat>& Vr,
+                                  memory<dfloat>& Vs){
 
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq;
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq;
+  const int Npoints = _r.length();
 
+  Vr.malloc(Npoints*_Np);
+  Vs.malloc(Npoints*_Np);
   for(int n=0; n<Npoints; n++){
     for(int j=0; j<_Nq; j++){
       for(int i=0; i<_Nq; i++){
         int id = n*_Np+i+j*_Nq;
-        GradOrthonormalBasisQuad2D(_r[n], _s[n], i, j, Vr+id, Vs+id);
+        GradOrthonormalBasisQuad2D(_r[n], _s[n], i, j, Vr[id], Vs[id]);
       }
     }
   }
@@ -200,9 +352,12 @@ void mesh_t::GradVandermondeQuad2D(int _N, int Npoints, dfloat *_r, dfloat *_s,
 // ------------------------------------------------------------------------
 // 2D OPERATOR MATRICES
 // ------------------------------------------------------------------------
-void mesh_t::MassMatrixQuad2D(int _Np, dfloat *V, dfloat *_MM){
+void mesh_t::MassMatrixQuad2D(const int _Np,
+                              const memory<dfloat> V,
+                              memory<dfloat>& _MM){
 
-  // masMatrix = inv(V')*inv(V) = inv(V*V')
+  // massMatrix = inv(V')*inv(V) = inv(V*V')
+  _MM.malloc(_Np*_Np);
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_Np;++m){
       dfloat res = 0;
@@ -212,15 +367,18 @@ void mesh_t::MassMatrixQuad2D(int _Np, dfloat *V, dfloat *_MM){
       _MM[n*_Np + m] = res;
     }
   }
-  matrixInverse(_Np, _MM);
+  linAlg_t::matrixInverse(_Np, _MM);
 }
 
-void mesh_t::LumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_MM){
+void mesh_t::LumpedMassMatrixQuad2D(const int _N,
+                                    const memory<dfloat> _gllw,
+                                    memory<dfloat>& _MM){
 
-  int _Nq = _N+1;
-  int _Np = _Nq*_Nq;
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq;
 
   // LumpedMassMatrix = gllw \ctimes gllw
+  _MM.malloc(_Np*_Np, 0.0);
   for(int n=0;n<_Nq;++n){
     for(int m=0;m<_Nq;++m){
       int id = n+m*_Nq;
@@ -229,12 +387,15 @@ void mesh_t::LumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_MM){
   }
 }
 
-void mesh_t::invLumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_invMM){
+void mesh_t::invLumpedMassMatrixQuad2D(const int _N,
+                                       const memory<dfloat> _gllw,
+                                       memory<dfloat>& _invMM){
 
   int _Nq = _N+1;
   int _Np = _Nq*_Nq;
 
   // invLumpedMassMatrix = invgllw \ctimes invgllw
+  _invMM.malloc(_Np*_Np, 0.0);
   for(int n=0;n<_Nq;++n){
     for(int m=0;m<_Nq;++m){
       int id = n+m*_Nq;
@@ -243,43 +404,51 @@ void mesh_t::invLumpedMassMatrixQuad2D(int _N, dfloat *_gllw, dfloat *_invMM){
   }
 }
 
-void mesh_t::DmatrixQuad2D(int _N, int Npoints, dfloat *_r, dfloat *_s,
-                                                dfloat *_Dr, dfloat *_Ds){
+void mesh_t::DmatrixQuad2D(const int _N,
+                           const memory<dfloat> _r,
+                           const memory<dfloat> _s,
+                           memory<dfloat>& _D){
 
-  int _Np = (_N+1)*(_N+1);
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq;
 
-  dfloat *V  = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vr = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vs = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-
-  VandermondeQuad2D(_N, Npoints, _r, _s, V);
-  GradVandermondeQuad2D(_N, Npoints, _r, _s, Vr, Vs);
+  memory<dfloat> V, Vr, Vs;
+  VandermondeQuad2D(_N, _r, _s, V);
+  GradVandermondeQuad2D(_N, _r, _s, Vr, Vs);
 
   //Dr = Vr/V, Ds = Vs/V
-  matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr);
-  matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds);
-
-  free(V); free(Vr); free(Vs);
+  _D.malloc(2*_Np*_Np);
+  memory<dfloat> _Dr = _D + 0*_Np*_Np;
+  memory<dfloat> _Ds = _D + 1*_Np*_Np;
+  linAlg_t::matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr);
+  linAlg_t::matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds);
 }
 
-void mesh_t::InterpolationMatrixQuad2D(int _N,
-                               int NpointsIn, dfloat *rIn, dfloat *sIn,
-                               int NpointsOut, dfloat *rOut, dfloat *sOut,
-                               dfloat *I){
+void mesh_t::InterpolationMatrixQuad2D(const int _N,
+                                       const memory<dfloat> rIn,
+                                       const memory<dfloat> sIn,
+                                       const memory<dfloat> rOut,
+                                       const memory<dfloat> sOut,
+                                       memory<dfloat>& I){
 
-  int _Np = (_N+1)*(_N+1);
+  const int _Nq = _N+1;
+  const int _Np = _Nq*_Nq;
 
-  // need NpointsIn = _Np
-  if (NpointsIn != _Np)
-    LIBP_ABORT(string("Invalid Interplation operator requested."))
+  const int NpointsIn  = rIn.length();
+  const int NpointsOut = rOut.length();
 
-  dfloat *VIn = (dfloat*) malloc(NpointsIn*_Np*sizeof(dfloat));
-  dfloat *VOut= (dfloat*) malloc(NpointsOut*_Np*sizeof(dfloat));
+  // need NpointsIn = _Np
+  LIBP_ABORT("Invalid Interplation operator requested.",
+             NpointsIn != _Np);
 
-  VandermondeQuad2D(_N, NpointsIn,   rIn, sIn, VIn);
-  VandermondeQuad2D(_N, NpointsOut, rOut, sOut, VOut);
+  memory<dfloat> VIn;
+  memory<dfloat> VOut;
+  VandermondeQuad2D(_N, rIn, sIn, VIn);
+  VandermondeQuad2D(_N, rOut, sOut, VOut);
 
-  matrixRightSolve(NpointsOut, _Np, VOut, NpointsIn, _Np, VIn, I);
+  I.malloc(NpointsIn*NpointsOut);
+  linAlg_t::matrixRightSolve(NpointsOut, _Np, VOut,
+                             NpointsIn, _Np, VIn, I);
+}
 
-  free(VIn); free(VOut);
-}
\ No newline at end of file
+} //namespace libp
diff --git a/libs/mesh/meshBasisTet3D.cpp b/libs/mesh/meshBasisTet3D.cpp
index 4d2b0828d..4c1386fa4 100644
--- a/libs/mesh/meshBasisTet3D.cpp
+++ b/libs/mesh/meshBasisTet3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,32 +25,34 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 // ------------------------------------------------------------------------
 // TET 3D NODES
 // ------------------------------------------------------------------------
-void mesh_t::NodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){
-
-  int _Np = (_N+1)*(_N+2)*(_N+3)/6;
-
+void mesh_t::NodesTet3D(const int _N, // order; the node count is (_N+1)(_N+2)(_N+3)/6
+                        memory<dfloat>& _r, // out: allocated by EquispacedNodesTet3D
+                        memory<dfloat>& _s, // out: allocated by EquispacedNodesTet3D
+                        memory<dfloat>& _t){ // out: allocated by EquispacedNodesTet3D
   EquispacedNodesTet3D(_N, _r, _s, _t); //make equispaced nodes on reference tet
-  WarpBlendTransformTet3D(_N, _Np, _r, _s, _t); //apply warp&blend transform
+  WarpBlendTransformTet3D(_N, _r, _s, _t); //apply warp&blend transform
 }
 
-void mesh_t::FaceNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes){
-  int _Nfp = (_N+1)*(_N+2)/2;
-  int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+void mesh_t::FaceNodesTet3D(const int _N,
+                            const memory<dfloat> _r,
+                            const memory<dfloat> _s,
+                            const memory<dfloat> _t,
+                            memory<int>& _faceNodes){
+  const int _Nfp = (_N+1)*(_N+2)/2;
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
 
   int cnt[4];
   for (int i=0;i<4;i++) cnt[i]=0;
 
-  dfloat deps = 1.;
-  while((1.+deps)>1.)
-    deps *= 0.5;
-
-  const dfloat NODETOL = 1000.*deps;
+  const dfloat NODETOL = 1.0e-5;
 
+  _faceNodes.malloc(4*_Nfp);
   for (int n=0;n<_Np;n++) {
     if(fabs(_t[n]+1)<NODETOL)
       _faceNodes[0*_Nfp+(cnt[0]++)] = n;
@@ -63,8 +65,12 @@ void mesh_t::FaceNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_fa
   }
 }
 
-void mesh_t::VertexNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_vertexNodes){
-  int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+void mesh_t::VertexNodesTet3D(const int _N,
+                              const memory<dfloat> _r,
+                              const memory<dfloat> _s,
+                              const memory<dfloat> _t,
+                              memory<int>& _vertexNodes){
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
 
   dfloat deps = 1.;
   while((1.+deps)>1.)
@@ -72,6 +78,7 @@ void mesh_t::VertexNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_
 
   const dfloat NODETOL = 1000.*deps;
 
+  _vertexNodes.malloc(4);
   for(int n=0;n<_Np;++n){
     if( (_r[n]+1)*(_r[n]+1)+(_s[n]+1)*(_s[n]+1)+(_t[n]+1)*(_t[n]+1)<NODETOL)
       _vertexNodes[0] = n;
@@ -84,8 +91,148 @@ void mesh_t::VertexNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_
   }
 }
 
+/* Find a matching array between nodes on matching faces.
+   For every face pair (fM,fP) and every rotation rot of face fP's vertices,
+   R[((fM*Nfaces + fP)*NfaceVertices + rot)*Nfp + n] is set to the index m of
+   the node on face fP whose mapped face coordinates coincide with those of
+   node n on face fM. R is allocated here; aborts if a node has no partner. */
+void mesh_t::FaceNodeMatchingTet3D(const memory<dfloat> _r,
+                                   const memory<dfloat> _s,
+                                   const memory<dfloat> _t,
+                                   const memory<int> _faceNodes,
+                                   const memory<int> _faceVertices,
+                                   memory<int>& R){
+
+  const int _Nfaces = 4;
+  const int _Nverts = 4;
+  const int _NfaceVertices = 3;
+
+  const int _Nfp = _faceNodes.length()/_Nfaces;
+
+  const dfloat NODETOL = 1.0e-5;
+
+  dfloat V0[3][2] = {{-1.0,-1.0},{ 1.0,-1.0},{-1.0, 1.0}};
+  dfloat V1[3][2] = {{-1.0,-1.0},{-1.0, 1.0},{ 1.0,-1.0}};
+
+  dfloat EX0[_Nverts], EY0[_Nverts];
+  dfloat EX1[_Nverts], EY1[_Nverts];
+
+  memory<dfloat> x0(_Nfp);
+  memory<dfloat> y0(_Nfp);
+
+  memory<dfloat> x1(_Nfp);
+  memory<dfloat> y1(_Nfp);
+
+  R.malloc(_Nfaces*_Nfaces*_NfaceVertices*_Nfp);
+
+  for (int fM=0;fM<_Nfaces;fM++) {
+
+    for (int v=0;v<_Nverts;v++) {
+      EX0[v] = 0.0; EY0[v] = 0.0;
+    }
+    //setup top element with face fM on the bottom
+    for (int v=0;v<_NfaceVertices;v++) {
+      int fv = _faceVertices[fM*_NfaceVertices + v];
+      EX0[fv] = V0[v][0]; EY0[fv] = V0[v][1];
+    }
+
+    for(int n=0;n<_Nfp;++n){ /* for each face node */
+      const int fn = _faceNodes[fM*_Nfp+n];
+
+      /* (r,s,t) coordinates of interpolation nodes*/
+      dfloat rn = _r[fn];
+      dfloat sn = _s[fn];
+      dfloat tn = _t[fn];
+
+      /* physical coordinate of interpolation node */
+      x0[n] = -0.5*(1+rn+sn+tn)*EX0[0]
+             + 0.5*(1+rn)*EX0[1]
+             + 0.5*(1+sn)*EX0[2]
+             + 0.5*(1+tn)*EX0[3];
+      y0[n] = -0.5*(1+rn+sn+tn)*EY0[0]
+             + 0.5*(1+rn)*EY0[1]
+             + 0.5*(1+sn)*EY0[2]
+             + 0.5*(1+tn)*EY0[3];
+    }
+
+    for (int fP=0;fP<_Nfaces;fP++) { /*For each neighbor face */
+      for (int rot=0;rot<_NfaceVertices;rot++) { /* For each face rotation */
+        // Zero vertices
+        for (int v=0;v<_Nverts;v++) {
+          EX1[v] = 0.0; EY1[v] = 0.0;
+        }
+        //setup bottom element with face fP on the top
+        for (int v=0;v<_NfaceVertices;v++) {
+          int fv = _faceVertices[fP*_NfaceVertices + ((v+rot)%_NfaceVertices)];
+          EX1[fv] = V1[v][0]; EY1[fv] = V1[v][1];
+        }
+
+        for(int n=0;n<_Nfp;++n){ /* for each node */
+          const int fn = _faceNodes[fP*_Nfp+n];
+
+          /* (r,s,t) coordinates of interpolation nodes*/
+          dfloat rn = _r[fn];
+          dfloat sn = _s[fn];
+          dfloat tn = _t[fn];
+
+          /* physical coordinate of interpolation node */
+          x1[n] = -0.5*(1+rn+sn+tn)*EX1[0]
+                 + 0.5*(1+rn)*EX1[1]
+                 + 0.5*(1+sn)*EX1[2]
+                 + 0.5*(1+tn)*EX1[3];
+          y1[n] = -0.5*(1+rn+sn+tn)*EY1[0]
+                 + 0.5*(1+rn)*EY1[1]
+                 + 0.5*(1+sn)*EY1[2]
+                 + 0.5*(1+tn)*EY1[3];
+        }
+
+        /* for each node on this face find the neighbor node */
+        for(int n=0;n<_Nfp;++n){
+          const dfloat xM = x0[n];
+          const dfloat yM = y0[n];
+
+          int m=0;
+          for(;m<_Nfp;++m){ /* for each neighbor node */
+            const dfloat xP = x1[m];
+            const dfloat yP = y1[m];
+
+            /* distance between target and neighbor node */
+            const dfloat dist = pow(xM-xP,2) + pow(yM-yP,2);
+
+            /* if neighbor node is close to target, match */
+            if(dist<NODETOL){
+              R[fM*_Nfaces*_NfaceVertices*_Nfp
+                + fP*_NfaceVertices*_Nfp
+                + rot*_Nfp + n] = m;
+              break;
+            }
+          }
+
+          /* Check: m==_Nfp means the search ran off the end without a match.
+             Test the index instead of re-reading x1[m],y1[m], which would be
+             one entry past the end of those _Nfp-long arrays. */
+          LIBP_ABORT("Unable to match face node, face: " << fM
+                     << ", matching face: " << fP
+                     << ", rotation: " << rot
+                     << ", node: " << n
+                     << ". Is the reference node set not symmetric?",
+                     m==_Nfp);
+        }
+      }
+    }
+  }
+}
+
 // Create equidistributed nodes on reference tet
-void mesh_t::EquispacedNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){
+void mesh_t::EquispacedNodesTet3D(const int _N,
+                                  memory<dfloat>& _r,
+                                  memory<dfloat>& _s,
+                                  memory<dfloat>& _t){
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+
+  _r.malloc(_Np);
+  _s.malloc(_Np);
+  _t.malloc(_Np);
 
   int sk = 0;
   for (int k=0;k<_N+1;k++) {
@@ -100,8 +247,11 @@ void mesh_t::EquispacedNodesTet3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t){
   }
 }
 
-void mesh_t::EquispacedEToVTet3D(int _N, int *_EToV){
-  int _Nverts = 4;
+void mesh_t::EquispacedEToVTet3D(const int _N, memory<int>& _EToV){
+  const int _Nverts = 4;
+  const int _Nelements = _N*_N*_N;
+
+  _EToV.malloc(_Nelements*_Nverts);
 
   int cnt=0;
   int sk=0;
@@ -161,14 +311,16 @@ void mesh_t::EquispacedEToVTet3D(int _N, int *_EToV){
   }
 }
 
-void mesh_t::SEMFEMEToVTet3D(int _N, int *_EToV){
+void mesh_t::SEMFEMEToVTet3D(const int _N, memory<int>& _EToV){ // thin wrapper: _EToV is filled by EquispacedEToVTet3D
   EquispacedEToVTet3D(_N, _EToV);
 }
 
 // ------------------------------------------------------------------------
 // ORTHONORMAL BASIS POLYNOMIALS
 // ------------------------------------------------------------------------
-void mesh_t::OrthonormalBasisTet3D(dfloat _r, dfloat _s, dfloat _t, int i, int j, int k, dfloat *P){
+void mesh_t::OrthonormalBasisTet3D(const dfloat _r, const dfloat _s, const dfloat _t,
+                                   const int i, const int j, const int k,
+                                   dfloat& P){
   // First convert to abc coordinates
   dfloat a, b, c;
   if(fabs(_s+_t)>1e-8)
@@ -187,11 +339,12 @@ void mesh_t::OrthonormalBasisTet3D(dfloat _r, dfloat _s, dfloat _t, int i, int j
   dfloat p2 = JacobiP(b,2*i+1,0,j);
   dfloat p3 = JacobiP(c,2*(i+j)+2,0,k);
 
-  *P = 2.*sqrt(2.0)*p1*p2*p3*pow(1.0-b,i)*pow(1.0-c,i+j);
+  P = 2.*sqrt(2.0)*p1*p2*p3*pow(1.0-b,i)*pow(1.0-c,i+j);
 }
 
-void mesh_t::GradOrthonormalBasisTet3D(dfloat _r, dfloat _s, dfloat _t,
-                                       int i, int j, int k, dfloat *Pr, dfloat *Ps, dfloat *Pt){
+void mesh_t::GradOrthonormalBasisTet3D(const dfloat _r, const dfloat _s, const dfloat _t,
+                                       const int i, const int j, const int k,
+                                       dfloat& Pr, dfloat& Ps, dfloat& Pt){
   // First convert to abc coordinates
   dfloat a, b, c;
   if(fabs(_s+_t)>1e-8)
@@ -214,48 +367,54 @@ void mesh_t::GradOrthonormalBasisTet3D(dfloat _r, dfloat _s, dfloat _t,
   dfloat p2b = GradJacobiP(b,2*i+1,0,j);
   dfloat p3c = GradJacobiP(c,2*(i+j)+2,0,k);
 
-  *Pr = p1a*p2*p3;
+  Pr = p1a*p2*p3;
   if(i>0)
-    *Pr *= pow(0.5*(1.0-b), i-1);
+    Pr *= pow(0.5*(1.0-b), i-1);
   if(i+j>0)
-    *Pr *= pow(0.5*(1.0-c), i+j-1);
+    Pr *= pow(0.5*(1.0-c), i+j-1);
 
-  *Ps = 0.5*(1.0+a)*(*Pr);
+  Ps = 0.5*(1.0+a)*(Pr);
   dfloat tmp = p2b*pow(0.5*(1.0-b), i);
   if(i>0)
     tmp += -0.5*i*p2*pow(0.5*(1.0-b), i-1);
   if(i+j>0)
     tmp *= pow(0.5*(1.0-c), i+j-1);
   tmp *= p1*p3;
-  *Ps += tmp;
+  Ps += tmp;
 
-  *Pt = 0.5*(1.0+a)*(*Pr) + 0.5*(1.0+b)*tmp;
+  Pt = 0.5*(1.0+a)*(Pr) + 0.5*(1.0+b)*tmp;
   tmp = p3c*pow(0.5*(1-c), i+j);
   if(i+j>0)
     tmp -= 0.5*(i+j)*(p3*pow(0.5*(1.0-c), i+j-1));
   tmp *= p1*p2*pow(0.5*(1-b), i);
-  *Pt += tmp;
+  Pt += tmp;
 
-  *Pr *= pow(2, 2*i+j+1.5);
-  *Ps *= pow(2, 2*i+j+1.5);
-  *Pt *= pow(2, 2*i+j+1.5);
+  Pr *= pow(2, 2*i+j+1.5);
+  Ps *= pow(2, 2*i+j+1.5);
+  Pt *= pow(2, 2*i+j+1.5);
 }
 
 // ------------------------------------------------------------------------
 // 3D VANDERMONDE MATRICES
 // ------------------------------------------------------------------------
 
-void mesh_t::VandermondeTet3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t, dfloat *V){
+void mesh_t::VandermondeTet3D(const int _N,
+                              const memory<dfloat> _r,
+                              const memory<dfloat> _s,
+                              const memory<dfloat> _t,
+                              memory<dfloat>& V){
 
-  int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+  const int Npoints = _r.length();
 
+  V.malloc(Npoints*_Np);
   for(int n=0; n<Npoints; n++){
     int sk=0;
     for(int i=0; i<_N+1; i++){
       for(int j=0; j<_N+1-i; j++){
         for(int k=0; k<_N+1-i-j; k++){
           int id = n*_Np+sk;
-          OrthonormalBasisTet3D(_r[n], _s[n], _t[n], i, j, k, V+id);
+          OrthonormalBasisTet3D(_r[n], _s[n], _t[n], i, j, k, V[id]);
           sk++;
         }
       }
@@ -263,18 +422,27 @@ void mesh_t::VandermondeTet3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloa
   }
 }
 
-void mesh_t::GradVandermondeTet3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t,
-                                  dfloat *Vr, dfloat *Vs, dfloat *Vt){
+void mesh_t::GradVandermondeTet3D(const int _N,
+                                  const memory<dfloat> _r,
+                                  const memory<dfloat> _s,
+                                  const memory<dfloat> _t,
+                                  memory<dfloat>& Vr,
+                                  memory<dfloat>& Vs,
+                                  memory<dfloat>& Vt){
 
-  int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+  const int Npoints = _r.length();
 
+  Vr.malloc(Npoints*_Np);
+  Vs.malloc(Npoints*_Np);
+  Vt.malloc(Npoints*_Np);
   for(int n=0; n<Npoints; n++){
     int sk=0;
     for(int i=0; i<_N+1; i++){
       for(int j=0; j<_N+1-i; j++){
         for(int k=0; k<_N+1-i-j; k++){
           int id = n*_Np+sk;
-          GradOrthonormalBasisTet3D(_r[n], _s[n], _t[n], i, j, k, Vr+id, Vs+id, Vt+id);
+          GradOrthonormalBasisTet3D(_r[n], _s[n], _t[n], i, j, k, Vr[id], Vs[id], Vt[id]);
           sk++;
         }
       }
@@ -285,9 +453,12 @@ void mesh_t::GradVandermondeTet3D(int _N, int Npoints, dfloat *_r, dfloat *_s, d
 // ------------------------------------------------------------------------
 // 3D OPERATOR MATRICES
 // ------------------------------------------------------------------------
-void mesh_t::MassMatrixTet3D(int _Np, dfloat *V, dfloat *_MM){
+void mesh_t::MassMatrixTet3D(const int _Np,
+                             const memory<dfloat> V,
+                             memory<dfloat>& _MM){
 
   // massMatrix = inv(V')*inv(V) = inv(V*V')
+  _MM.malloc(_Np*_Np);
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_Np;++m){
       dfloat res = 0;
@@ -297,12 +468,15 @@ void mesh_t::MassMatrixTet3D(int _Np, dfloat *V, dfloat *_MM){
       _MM[n*_Np + m] = res;
     }
   }
-  matrixInverse(_Np, _MM);
+  linAlg_t::matrixInverse(_Np, _MM);
 }
 
-void mesh_t::invMassMatrixTet3D(int _Np, dfloat *V, dfloat *_invMM){
+void mesh_t::invMassMatrixTet3D(const int _Np,
+                                const memory<dfloat> V,
+                                memory<dfloat>& _invMM){
 
   // massMatrix^{-1} = V*V'
+  _invMM.malloc(_Np*_Np);
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_Np;++m){
       dfloat res = 0;
@@ -314,43 +488,46 @@ void mesh_t::invMassMatrixTet3D(int _Np, dfloat *V, dfloat *_invMM){
   }
 }
 
-void mesh_t::DmatrixTet3D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *_t,
-                                               dfloat *_Dr, dfloat *_Ds, dfloat *_Dt){
+void mesh_t::DmatrixTet3D(const int _N,
+                          const memory<dfloat> _r,
+                          const memory<dfloat> _s,
+                          const memory<dfloat> _t,
+                          memory<dfloat>& _D){ // out: Dr, Ds, Dt stored back to back, allocated below
 
-  int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
 
-  dfloat *V  = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vr = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vs = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vt = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-
-  VandermondeTet3D(_N, Npoints, _r, _s, _t, V);
-  GradVandermondeTet3D(_N, Npoints, _r, _s, _t, Vr, Vs, Vt);
+  memory<dfloat> V, Vr, Vs, Vt; // each allocated to _r.length()*_Np entries by the calls below
+  VandermondeTet3D(_N, _r, _s, _t, V);
+  GradVandermondeTet3D(_N, _r, _s, _t, Vr, Vs, Vt);
 
   //Dr = Vr/V, Ds = Vs/V
-  matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr);
-  matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds);
-  matrixRightSolve(_Np, _Np, Vt, _Np, _Np, V, _Dt);
-
-  free(V); free(Vr); free(Vs); free(Vt);
+  _D.malloc(3*_Np*_Np); // three consecutive blocks of _Np*_Np entries: Dr, Ds, Dt
+  memory<dfloat> _Dr = _D + 0*_Np*_Np; // presumably a view into _D rather than a copy -- TODO confirm memory<> offset semantics
+  memory<dfloat> _Ds = _D + 1*_Np*_Np;
+  memory<dfloat> _Dt = _D + 2*_Np*_Np;
+  linAlg_t::matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr); // NOTE(review): row counts are passed as _Np, so this assumes _r.length() == _Np
+  linAlg_t::matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds);
+  linAlg_t::matrixRightSolve(_Np, _Np, Vt, _Np, _Np, V, _Dt); // Dt = Vt/V
 }
 
-void mesh_t::LIFTmatrixTet3D(int _N, int *_faceNodes,
-                             dfloat *_r, dfloat *_s, dfloat *_t, dfloat *_LIFT){
+void mesh_t::LIFTmatrixTet3D(const int _N,
+                             const memory<int> _faceNodes,
+                             const memory<dfloat> _r,
+                             const memory<dfloat> _s,
+                             const memory<dfloat> _t,
+                             memory<dfloat>& _LIFT){
 
-  int _Nfp = (_N+1)*(_N+2)/2;
-  int _Np = (_N+1)*(_N+2)*(_N+3)/6;
-  int _Nfaces = 4;
+  const int _Nfp = (_N+1)*(_N+2)/2;
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+  const int _Nfaces = 4;
 
-  dfloat *E = (dfloat *) calloc(_Np*_Nfaces*_Nfp, sizeof(dfloat));
+  memory<dfloat> E(_Np*_Nfaces*_Nfp, 0);
 
-  dfloat *r2D = (dfloat *) malloc(_Nfp*sizeof(dfloat));
-  dfloat *s2D = (dfloat *) malloc(_Nfp*sizeof(dfloat));
-  dfloat *V2D = (dfloat *) malloc(_Nfp*_Nfp*sizeof(dfloat));
-  dfloat *MM2D = (dfloat *) malloc(_Nfp*_Nfp*sizeof(dfloat));
+  memory<dfloat> r2D(_Nfp);
+  memory<dfloat> s2D(_Nfp);
 
   for (int f=0;f<_Nfaces;f++) {
-    dfloat *rFace, *sFace;
+    memory<dfloat> rFace, sFace;
     if (f==0) {rFace = _r; sFace = _s;}
     if (f==1) {rFace = _r; sFace = _t;}
     if (f==2) {rFace = _s; sFace = _t;}
@@ -361,7 +538,8 @@ void mesh_t::LIFTmatrixTet3D(int _N, int *_faceNodes,
       s2D[i] = sFace[_faceNodes[f*_Nfp+i]];
     }
 
-    VandermondeTri2D(_N, _Nfp, r2D, s2D, V2D);
+    memory<dfloat> V2D, MM2D;
+    VandermondeTri2D(_N, r2D, s2D, V2D);
     MassMatrixTri2D(_Nfp, V2D, MM2D);
 
     for (int j=0;j<_Nfp;j++) {
@@ -372,9 +550,10 @@ void mesh_t::LIFTmatrixTet3D(int _N, int *_faceNodes,
     }
   }
 
-  dfloat *V = (dfloat *) malloc(_Np*_Np*sizeof(dfloat));
-  VandermondeTet3D(_N, _Np, _r, _s, _t, V);
+  memory<dfloat> V;
+  VandermondeTet3D(_N, _r, _s, _t, V);
 
+  _LIFT.malloc(_Np*_Nfaces*_Nfp);
   for (int n=0;n<_Np;n++) {
     for (int m=0;m<_Nfaces*_Nfp;m++) {
 
@@ -388,16 +567,18 @@ void mesh_t::LIFTmatrixTet3D(int _N, int *_faceNodes,
       }
     }
   }
-
-  free(V); free(r2D); free(s2D); free(V2D); free(MM2D); free(E);
 }
 
-void mesh_t::SurfaceMassMatrixTet3D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat *_sM){
+void mesh_t::SurfaceMassMatrixTet3D(const int _N,
+                                    const memory<dfloat> _MM,
+                                    const memory<dfloat> _LIFT,
+                                    memory<dfloat>& _sM){
 
-  int _Nfp = (_N+1)*(_N+2)/2;
-  int _Np = (_N+1)*(_N+2)*(_N+3)/6;
-  int _Nfaces = 4;
+  const int _Nfp = (_N+1)*(_N+2)/2;
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+  const int _Nfaces = 4;
 
+  _sM.malloc(_Np*_Nfaces*_Nfp);
   for (int n=0;n<_Np;n++) {
     for (int m=0;m<_Nfp*_Nfaces;m++) {
       _sM[m+n*_Nfp*_Nfaces] = 0;
@@ -408,12 +589,22 @@ void mesh_t::SurfaceMassMatrixTet3D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat *
   }
 }
 
-void mesh_t::SmatrixTet3D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_Dt, dfloat *_MM,
-                          dfloat *_Srr, dfloat *_Srs, dfloat *_Srt,
-                          dfloat *_Sss, dfloat *_Sst, dfloat *_Stt){
-
-  int _Np = (_N+1)*(_N+2)*(_N+3)/6;
-
+void mesh_t::SmatrixTet3D(const int _N,
+                          const memory<dfloat> _Dr,
+                          const memory<dfloat> _Ds,
+                          const memory<dfloat> _Dt,
+                          const memory<dfloat> _MM,
+                          memory<dfloat>& _S){
+
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+
+  _S.malloc(6*_Np*_Np, 0.0);
+  memory<dfloat> _Srr = _S + 0*_Np*_Np;
+  memory<dfloat> _Srs = _S + 1*_Np*_Np;
+  memory<dfloat> _Srt = _S + 2*_Np*_Np;
+  memory<dfloat> _Sss = _S + 3*_Np*_Np;
+  memory<dfloat> _Sst = _S + 4*_Np*_Np;
+  memory<dfloat> _Stt = _S + 5*_Np*_Np;
   for (int n=0;n<_Np;n++) {
     for (int m=0;m<_Np;m++) {
       for (int k=0;k<_Np;k++) {
@@ -433,59 +624,65 @@ void mesh_t::SmatrixTet3D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_Dt, dfloat
   }
 }
 
-void mesh_t::InterpolationMatrixTet3D(int _N,
-                               int NpointsIn, dfloat *rIn, dfloat *sIn, dfloat *tIn,
-                               int NpointsOut, dfloat *rOut, dfloat *sOut, dfloat *tOut,
-                               dfloat *I){
-
-  int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+void mesh_t::InterpolationMatrixTet3D(const int _N,
+                                      const memory<dfloat> rIn,  // input nodes; rIn.length() must equal _Np
+                                      const memory<dfloat> sIn,
+                                      const memory<dfloat> tIn,
+                                      const memory<dfloat> rOut, // output nodes; rOut.length() sets NpointsOut
+                                      const memory<dfloat> sOut,
+                                      const memory<dfloat> tOut,
+                                      memory<dfloat>& I){ // out: allocated here with NpointsIn*NpointsOut entries
 
-  // need NpointsIn = _Np
-  if (NpointsIn != _Np)
-    LIBP_ABORT(string("Invalid Interplation operator requested."))
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
 
-  dfloat *VIn = (dfloat*) malloc(NpointsIn*_Np*sizeof(dfloat));
-  dfloat *VOut= (dfloat*) malloc(NpointsOut*_Np*sizeof(dfloat));
+  const int NpointsIn  = rIn.length();
+  const int NpointsOut = rOut.length();
 
-  VandermondeTet3D(_N, NpointsIn,   rIn, sIn, tIn, VIn);
-  VandermondeTet3D(_N, NpointsOut, rOut, sOut, tOut, VOut);
+  // I = VOut/VIn needs a square VIn (NpointsIn x _Np): need NpointsIn = _Np
+  LIBP_ABORT("Invalid Interpolation operator requested.",
+             NpointsIn != _Np);
 
-  matrixRightSolve(NpointsOut, _Np, VOut, NpointsIn, _Np, VIn, I);
+  memory<dfloat> VIn;  // Vandermonde matrix at the input nodes
+  memory<dfloat> VOut; // Vandermonde matrix at the output nodes
+  VandermondeTet3D(_N, rIn, sIn, tIn, VIn);
+  VandermondeTet3D(_N, rOut, sOut, tOut, VOut);
 
-  free(VIn); free(VOut);
+  I.malloc(NpointsIn*NpointsOut);
+  linAlg_t::matrixRightSolve(NpointsOut, _Np, VOut,
+                             NpointsIn, _Np, VIn, I);
 }
 
-void mesh_t::DegreeRaiseMatrixTet3D(int Nc, int Nf, dfloat *P){
-
-  int Npc = (Nc+1)*(Nc+2)*(Nc+3)/6;
-  int Npf = (Nf+1)*(Nf+2)*(Nf+3)/6;
-
-  dfloat *rc = (dfloat *) malloc(Npc*sizeof(dfloat));
-  dfloat *sc = (dfloat *) malloc(Npc*sizeof(dfloat));
-  dfloat *tc = (dfloat *) malloc(Npc*sizeof(dfloat));
-  dfloat *rf = (dfloat *) malloc(Npf*sizeof(dfloat));
-  dfloat *sf = (dfloat *) malloc(Npf*sizeof(dfloat));
-  dfloat *tf = (dfloat *) malloc(Npf*sizeof(dfloat));
+void mesh_t::DegreeRaiseMatrixTet3D(const int Nc, const int Nf,
+                                    memory<dfloat>& P){ // out: allocated by InterpolationMatrixTet3D
 
+  memory<dfloat> rc, sc, tc; // order-Nc node set, allocated and filled by NodesTet3D
+  memory<dfloat> rf, sf, tf; // order-Nf node set, allocated and filled by NodesTet3D
   NodesTet3D(Nc, rc, sc, tc);
   NodesTet3D(Nf, rf, sf, tf);
 
-  InterpolationMatrixTet3D(Nc, Npc, rc, sc, tc, Npf, rf, sf, tf, P);
-
-  free(rc); free(sc); free(tc); free(rf); free(sf); free(tf);
+  InterpolationMatrixTet3D(Nc, rc, sc, tc, rf, sf, tf, P); // order-Nc basis: Nc nodes are the input set, Nf nodes the output set
 }
 
-void mesh_t::CubaturePmatrixTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t,
-                                  int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubt,
-                                  dfloat *_cubProject){
+void mesh_t::CubaturePmatrixTet3D(const int _N,
+                                  const memory<dfloat> _r,
+                                  const memory<dfloat> _s,
+                                  const memory<dfloat> _t,
+                                  const memory<dfloat> _cubr,
+                                  const memory<dfloat> _cubs,
+                                  const memory<dfloat> _cubt,
+                                  memory<dfloat>& _cubProject){
+
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+  const int _cubNp = _cubr.length();
 
-  dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
-  VandermondeTet3D(_N, _Np, _r, _s, _t, V);
+  memory<dfloat> V;
+  VandermondeTet3D(_N, _r, _s, _t, V);
 
-  dfloat *cubV  = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat));
-  VandermondeTet3D(_N, _cubNp, _cubr, _cubs, _cubt, cubV);
+  memory<dfloat> cubV;
+  VandermondeTet3D(_N, _cubr, _cubs, _cubt, cubV);
 
   // cubProject = V*cV' %% relies on (transpose(cV)*diag(cubw)*cV being the identity)
+  _cubProject.malloc(_Np*_cubNp);
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_cubNp;++m){
       dfloat resP = 0;
@@ -495,26 +692,34 @@ void mesh_t::CubaturePmatrixTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloa
      _cubProject[n*_cubNp+m] = resP;
     }
   }
-  free(V); free(cubV);
 }
 
-void mesh_t::CubatureWeakDmatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t,
-                                        int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubt,
-                                        dfloat *_cubPDrT, dfloat *_cubPDsT, dfloat *_cubPDtT){
+void mesh_t::CubatureWeakDmatricesTet3D(const int _N,
+                                        const memory<dfloat> _r,
+                                        const memory<dfloat> _s,
+                                        const memory<dfloat> _t,
+                                        const memory<dfloat> _cubr,
+                                        const memory<dfloat> _cubs,
+                                        const memory<dfloat> _cubt,
+                                        memory<dfloat>& _cubPDT){
 
-  dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
-  VandermondeTet3D(_N, _Np, _r, _s, _t, V);
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+  const int _cubNp = _cubr.length();
 
-  dfloat *cubV  = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat));
-  dfloat *cubVr = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat));
-  dfloat *cubVs = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat));
-  dfloat *cubVt = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat));
-  VandermondeTet3D(_N, _cubNp, _cubr, _cubs, _cubt, cubV);
-  GradVandermondeTet3D(_N, _cubNp, _cubr, _cubs, _cubt, cubVr, cubVs, cubVt);
+  memory<dfloat> V;
+  VandermondeTet3D(_N, _r, _s, _t, V);
+
+  memory<dfloat> cubV, cubVr, cubVs, cubVt;
+  VandermondeTet3D(_N, _cubr, _cubs, _cubt, cubV);
+  GradVandermondeTet3D(_N, _cubr, _cubs, _cubt, cubVr, cubVs, cubVt);
 
   // cubPDrT = V*transpose(cVr);
   // cubPDsT = V*transpose(cVs);
   // cubPDtT = V*transpose(cVt);
+  _cubPDT.malloc(3*_Np*_cubNp);
+  memory<dfloat> _cubPDrT = _cubPDT + 0*_Np*_cubNp;
+  memory<dfloat> _cubPDsT = _cubPDT + 1*_Np*_cubNp;
+  memory<dfloat> _cubPDtT = _cubPDT + 2*_Np*_cubNp;
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_cubNp;++m){
       dfloat resPDrT = 0, resPDsT = 0, resPDtT = 0;
@@ -531,23 +736,31 @@ void mesh_t::CubatureWeakDmatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_s,
       _cubPDtT[n*_cubNp+m] = resPDtT;
     }
   }
-  free(V); free(cubV); free(cubVr); free(cubVs); free(cubVt);
 }
 
-void mesh_t::CubatureSurfaceMatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes,
-                                    int _intNfp, dfloat *_intr, dfloat *_ints, dfloat *_intw,
-                                    dfloat *_intInterp, dfloat *_intLIFT){
-
-  int _Nfaces = 4;
-  int _Nfp = (_N+1)*(_N+2)/2;
-
-  dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
-  VandermondeTet3D(_N, _Np, _r, _s, _t, V);
-
-  dfloat *ir = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat));
-  dfloat *is = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat));
-  dfloat *it = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat));
-  dfloat *iw = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat));
+void mesh_t::CubatureSurfaceMatricesTet3D(const int _N,
+                                          const memory<dfloat> _r,
+                                          const memory<dfloat> _s,
+                                          const memory<dfloat> _t,
+                                          const memory<int> _faceNodes,
+                                          const memory<dfloat> _intr,
+                                          const memory<dfloat> _ints,
+                                          const memory<dfloat> _intw,
+                                          memory<dfloat>& _intInterp,
+                                          memory<dfloat>& _intLIFT){
+
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+  const int _Nfp = (_N+1)*(_N+2)/2;
+  const int _Nfaces = 4;
+  const int _intNfp = _intr.length();
+
+  memory<dfloat> V;
+  VandermondeTet3D(_N, _r, _s, _t, V);
+
+  memory<dfloat> ir(_intNfp*_Nfaces);
+  memory<dfloat> is(_intNfp*_Nfaces);
+  memory<dfloat> it(_intNfp*_Nfaces);
+  memory<dfloat> iw(_intNfp*_Nfaces);
 
   for(int n=0;n<_intNfp;++n){
     ir[0*_intNfp + n] =  _intr[n];
@@ -571,9 +784,10 @@ void mesh_t::CubatureSurfaceMatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_
     iw[3*_intNfp + n] =  _intw[n];
   }
 
-  dfloat *sInterp = (dfloat*) malloc(_intNfp*_Nfaces*_Np*sizeof(dfloat));
-  InterpolationMatrixTet3D(_N, _Np, _r, _s, _t, _Nfaces*_intNfp, ir, is, it, sInterp);
+  memory<dfloat> sInterp;
+  InterpolationMatrixTet3D(_N, _r, _s, _t, ir, is, it, sInterp);
 
+  _intInterp.malloc(_Nfaces*_intNfp*_Nfp);
   for(int n=0;n<_intNfp;++n){
     for(int m=0;m<_Nfp;++m){
       _intInterp[0*_intNfp*_Nfp + n*_Nfp + m] = sInterp[(n+0*_intNfp)*_Np+_faceNodes[0*_Nfp+m]];
@@ -585,6 +799,7 @@ void mesh_t::CubatureSurfaceMatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_
 
   // integration node lift matrix
   //iLIFT = V*V'*sInterp'*diag(iw(:));
+  _intLIFT.malloc(_Nfaces*_intNfp*_Np);
   for(int n=0;n<_Nfaces*_intNfp;++n){
     for(int m=0;m<_Np;++m){
       _intLIFT[m*_Nfaces*_intNfp+n] = 0.0;
@@ -595,19 +810,24 @@ void mesh_t::CubatureSurfaceMatricesTet3D(int _N, int _Np, dfloat *_r, dfloat *_
       }
     }
   }
-
-  free(V); free(ir);  free(is); free(it); free(iw);  free(sInterp);
 }
 
-void mesh_t::SEMFEMInterpMatrixTet3D(int _N,
-                                    int _Np, dfloat *_r, dfloat *_s, dfloat *_t,
-                                    int _NpFEM, dfloat *_rFEM, dfloat *_sFEM, dfloat *_tFEM,
-                                    dfloat *I){
+void mesh_t::SEMFEMInterpMatrixTet3D(const int _N,
+                                     const memory<dfloat> _r,
+                                     const memory<dfloat> _s,
+                                     const memory<dfloat> _t,
+                                     const memory<dfloat> _rFEM,
+                                     const memory<dfloat> _sFEM,
+                                     const memory<dfloat> _tFEM,
+                                     memory<dfloat>& I){
+
+  const int _Np = (_N+1)*(_N+2)*(_N+3)/6;
+  const int _NpFEM = _rFEM.length();
 
-  dfloat *IQN = (dfloat*) malloc(_NpFEM*_Np*sizeof(dfloat));
-  InterpolationMatrixTet3D(_N, _Np, _r, _s, _t, _NpFEM, _rFEM, _sFEM, _tFEM, IQN);
+  memory<dfloat> IQN;
+  InterpolationMatrixTet3D(_N, _r, _s, _t, _rFEM, _sFEM, _tFEM, IQN);
 
-  dfloat *IQTIQ = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
+  memory<dfloat> IQTIQ(_Np*_Np);
   // IQTIQ = IQN'*IQN
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_Np;++m){
@@ -619,9 +839,7 @@ void mesh_t::SEMFEMInterpMatrixTet3D(int _N,
   }
 
   // I = IQN/(IQN'*IQN)  - pseudo inverse
-  matrixRightSolve(_NpFEM, _Np, IQN, _Np, _Np, IQTIQ, I);
-
-  free(IQN); free(IQTIQ);
+  linAlg_t::matrixRightSolve(_NpFEM, _Np, IQN, _Np, _Np, IQTIQ, I);
 }
 
 // ------------------------------------------------------------------------
@@ -630,16 +848,24 @@ void mesh_t::SEMFEMInterpMatrixTet3D(int _N,
 //                       Journal of engineering mathematics, 56(3), 247-262.
 // ------------------------------------------------------------------------
 
-static void xyztorst(int Npoints, dfloat *x, dfloat *y, dfloat *z, dfloat *r, dfloat *s, dfloat *t) {
+static void xyztorst(const memory<dfloat> x,
+                     const memory<dfloat> y,
+                     const memory<dfloat> z,
+                     memory<dfloat> r,
+                     memory<dfloat> s,
+                     memory<dfloat> t) {
+
+  const int Npoints = x.length();
+
   // vertices of tetrahedron
   dfloat v1[3] = {-1.0, -1./sqrt(3.), -1./sqrt(6.)};
   dfloat v2[3] = { 1.0, -1./sqrt(3.), -1./sqrt(6.)};
   dfloat v3[3] = { 0.0,  2./sqrt(3.), -1./sqrt(6.)};
   dfloat v4[3] = { 0.0,  0.,           3./sqrt(6.)};
 
-  dfloat *XYZ = (dfloat *) malloc(3*Npoints*sizeof(dfloat));
-  dfloat *RST = (dfloat *) malloc(3*Npoints*sizeof(dfloat));
-  dfloat *A = (dfloat *) malloc(3*3*sizeof(dfloat));
+  memory<dfloat> XYZ(3*Npoints);
+  memory<dfloat> RST(3*Npoints);
+  memory<dfloat> A(3*3);
 
   for (int i=0;i<3;i++) {
     A[0*3+i] = 0.5*(v2[i]-v1[i]);
@@ -653,30 +879,33 @@ static void xyztorst(int Npoints, dfloat *x, dfloat *y, dfloat *z, dfloat *r, df
     XYZ[3*n+2] = z[n]-0.5*(v2[2]+v3[2]+v4[2]-v1[2]);
   }
 
-  matrixRightSolve(Npoints, 3, XYZ, 3, 3, A, RST);
+  linAlg_t::matrixRightSolve(Npoints, 3, XYZ, 3, 3, A, RST);
 
   for (int n=0;n<Npoints;n++) {
     r[n] = RST[3*n+0];
     s[n] = RST[3*n+1];
     t[n] = RST[3*n+2];
   }
-
-  free(XYZ); free(RST); free(A);
 }
 
-void mesh_t::WarpShiftFace3D(int _N, int Npoints, dfloat alpha,
-                             dfloat *L1, dfloat *L2, dfloat *L3,
-                             dfloat *w1, dfloat *w2) {
+void mesh_t::WarpShiftFace3D(const int _N, const dfloat alpha,
+                             const memory<dfloat> L1,
+                             const memory<dfloat> L2,
+                             const memory<dfloat> L3,
+                             memory<dfloat> w1,
+                             memory<dfloat> w2) {
   // Compute scaled warp function at order N
   // based on rout interpolation nodes
 
-  dfloat *dL32 = (dfloat*) malloc(Npoints*sizeof(dfloat));
-  dfloat *dL13 = (dfloat*) malloc(Npoints*sizeof(dfloat));
-  dfloat *dL21 = (dfloat*) malloc(Npoints*sizeof(dfloat));
+  const int Npoints = L1.length();
 
-  dfloat *warpf1 = (dfloat*) malloc(Npoints*sizeof(dfloat));
-  dfloat *warpf2 = (dfloat*) malloc(Npoints*sizeof(dfloat));
-  dfloat *warpf3 = (dfloat*) malloc(Npoints*sizeof(dfloat));
+  memory<dfloat> dL32(Npoints);
+  memory<dfloat> dL13(Npoints);
+  memory<dfloat> dL21(Npoints);
+
+  memory<dfloat> warpf1(Npoints);
+  memory<dfloat> warpf2(Npoints);
+  memory<dfloat> warpf3(Npoints);
 
   for (int n=0;n<Npoints;n++) {
     dL32[n] = L3[n]-L2[n];
@@ -684,32 +913,35 @@ void mesh_t::WarpShiftFace3D(int _N, int Npoints, dfloat alpha,
     dL21[n] = L2[n]-L1[n];
   }
 
-  Warpfactor(_N, Npoints, dL32, warpf1);
-  Warpfactor(_N, Npoints, dL13, warpf2);
-  Warpfactor(_N, Npoints, dL21, warpf3);
+  Warpfactor(_N, dL32, warpf1);
+  Warpfactor(_N, dL13, warpf2);
+  Warpfactor(_N, dL21, warpf3);
 
   for (int n=0;n<Npoints;n++) {
-    dfloat blend1 = 4.0*L2[n]*L3[n];
-    dfloat blend2 = 4.0*L3[n]*L1[n];
-    dfloat blend3 = 4.0*L1[n]*L2[n];
+    const dfloat blend1 = 4.0*L2[n]*L3[n];
+    const dfloat blend2 = 4.0*L3[n]*L1[n];
+    const dfloat blend3 = 4.0*L1[n]*L2[n];
 
-    dfloat warp1 = blend1*warpf1[n]*(1.0+alpha*alpha*L1[n]*L1[n]);
-    dfloat warp2 = blend2*warpf2[n]*(1.0+alpha*alpha*L2[n]*L2[n]);
-    dfloat warp3 = blend3*warpf3[n]*(1.0+alpha*alpha*L3[n]*L3[n]);
+    const dfloat warp1 = blend1*warpf1[n]*(1.0+alpha*alpha*L1[n]*L1[n]);
+    const dfloat warp2 = blend2*warpf2[n]*(1.0+alpha*alpha*L2[n]*L2[n]);
+    const dfloat warp3 = blend3*warpf3[n]*(1.0+alpha*alpha*L3[n]*L3[n]);
 
     w1[n] = 1.*warp1 + cos(2.*M_PI/3.)*warp2 + cos(4.*M_PI/3.)*warp3;
     w2[n] = 0.*warp1 + sin(2.*M_PI/3.)*warp2 + sin(4.*M_PI/3.)*warp3;
   }
-
-  free(dL32); free(dL21); free(dL13);
-  free(warpf1); free(warpf2); free(warpf3);
 }
 
-void mesh_t::WarpBlendTransformTet3D(int _N, int _Npoints, dfloat *_r, dfloat *_s, dfloat *_t, dfloat alphaIn){
+void mesh_t::WarpBlendTransformTet3D(const int _N,
+                                     memory<dfloat> _r,
+                                     memory<dfloat> _s,
+                                     memory<dfloat> _t,
+                                     const dfloat alphaIn){
 
   const dfloat alpopt[15] = {0.0000,0.0000,0.00000,0.1002,1.1332,1.5608,1.3413,
                              1.2577,1.1603,1.10153,0.6080,0.4523,0.8856,0.8717,0.9655};
 
+  const int _Npoints = _r.length();
+
   dfloat alpha;
   if (alphaIn==-1) {
     if (_N<16) {
@@ -746,18 +978,18 @@ void mesh_t::WarpBlendTransformTet3D(int _N, int _Npoints, dfloat *_r, dfloat *_
   }
 
   // Convert r s coordinates to points in equilateral triangle
-  dfloat *L1 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *L2 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *L3 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *L4 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
+  memory<dfloat> L1(_Npoints);
+  memory<dfloat> L2(_Npoints);
+  memory<dfloat> L3(_Npoints);
+  memory<dfloat> L4(_Npoints);
 
-  dfloat *_x = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *_y = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *_z = (dfloat*) malloc(_Npoints*sizeof(dfloat));
+  memory<dfloat> _x(_Npoints);
+  memory<dfloat> _y(_Npoints);
+  memory<dfloat> _z(_Npoints);
 
-  dfloat *shiftx = (dfloat*) calloc(_Npoints,sizeof(dfloat));
-  dfloat *shifty = (dfloat*) calloc(_Npoints,sizeof(dfloat));
-  dfloat *shiftz = (dfloat*) calloc(_Npoints,sizeof(dfloat));
+  memory<dfloat> shiftx(_Npoints,0.0);
+  memory<dfloat> shifty(_Npoints,0.0);
+  memory<dfloat> shiftz(_Npoints,0.0);
 
   for (int n=0;n<_Npoints;n++) {
     L1[n] =  0.5*(1.+_t[n]);
@@ -770,18 +1002,18 @@ void mesh_t::WarpBlendTransformTet3D(int _N, int _Npoints, dfloat *_r, dfloat *_
     _z[n] =  L3[n]*v1[2]+L4[n]*v2[2]+L2[n]*v3[2]+L1[n]*v4[2];
   }
 
-  dfloat *warp1 = (dfloat*) calloc(_Npoints,sizeof(dfloat));
-  dfloat *warp2 = (dfloat*) calloc(_Npoints,sizeof(dfloat));
+  memory<dfloat> warp1(_Npoints);
+  memory<dfloat> warp2(_Npoints);
 
   for (int f=0;f<4;f++) {
-    dfloat *La, *Lb, *Lc, *Ld;
+    memory<dfloat> La, Lb, Lc, Ld;
     if(f==0) {La = L1; Lb = L2; Lc = L3; Ld = L4;}
     if(f==1) {La = L2; Lb = L1; Lc = L3; Ld = L4;}
     if(f==2) {La = L3; Lb = L1; Lc = L4; Ld = L2;}
     if(f==3) {La = L4; Lb = L1; Lc = L3; Ld = L2;}
 
     // compute warp tangential to face
-    WarpShiftFace3D(_N, _Npoints, alpha, Lb, Lc, Ld, warp1, warp2);
+    WarpShiftFace3D(_N, alpha, Lb, Lc, Ld, warp1, warp2);
 
     for (int n=0;n<_Npoints;n++) {
       dfloat blend = Lb[n]*Lc[n]*Ld[n];
@@ -811,12 +1043,7 @@ void mesh_t::WarpBlendTransformTet3D(int _N, int _Npoints, dfloat *_r, dfloat *_
     _z[n] += shiftz[n];
   }
 
-  xyztorst(_Npoints, _x, _y, _z, _r, _s, _t);
-
-  free(L1); free(L2); free(L3); free(L4);
-  free(warp1); free(warp2);
-  free(shiftx); free(shifty); free(shiftz);
-  free(_x); free(_y); free(_z);
+  xyztorst(_x, _y, _z, _r, _s, _t);
 }
 
 // ------------------------------------------------------------------------
@@ -901,19 +1128,22 @@ static const dfloat cubT15[214] = {-3.592259421353274e-01,-3.592259421353629e-01
 static const dfloat cubW15[214] = { 3.522723551354820e-03, 3.522723551352486e-03, 3.522723551352938e-03, 9.232955535331875e-03, 9.232955535327506e-03, 9.232955535330659e-03, 4.237026901463632e-03, 4.237026901464212e-03, 4.237026901463377e-03, 6.106499343749692e-03, 6.106499343748844e-03, 6.106499343750059e-03, 1.627360858046573e-03, 1.627360858046587e-03, 1.627360858046813e-03, 1.148548912222280e-03, 1.148548912221887e-03, 1.148548912221821e-03, 2.564399625003663e-03, 2.564399625004229e-03, 2.564399625002914e-03, 5.856076670044469e-03, 5.856076670043464e-03, 5.856076670043295e-03, 1.423643710751563e-03, 1.423643710751888e-03, 1.423643710751167e-03, 3.978120910726397e-03, 3.978120910728038e-03, 3.978120910727048e-03, 1.363809538497426e-03, 1.363809538497801e-03, 1.363809538497208e-03, 4.075959956903368e-04, 4.075959956909096e-04, 4.075959956900526e-04, 2.860089500953389e-03, 2.860089500953686e-03, 2.860089500953841e-03, 4.850559515934299e-03, 4.850559515933083e-03, 4.850559515932247e-03, 1.352971840215884e-02, 1.352971840216170e-02, 1.352971840215909e-02, 6.132094336152466e-03, 6.132094336152919e-03, 6.132094336153004e-03, 1.461675478856591e-02, 1.461675478855672e-02, 1.461675478856139e-02, 5.990023547122631e-03, 5.990023547123226e-03, 5.990023547122334e-03, 9.631452064974138e-03, 9.631452064972767e-03, 9.631452064974280e-03, 5.614990995964819e-03, 5.614990995964790e-03, 5.614990995963984e-03, 1.144090371383849e-03, 1.144090371383791e-03, 1.144090371383924e-03, 7.816866183700298e-03, 7.816866183699294e-03, 7.816866183700680e-03, 1.639566856148552e-02, 1.639566856149825e-02, 1.639566856148552e-02, 6.721523996645333e-03, 6.721523996646124e-03, 6.721523996646986e-03, 6.261479412226208e-03, 6.261479412226463e-03, 6.261479412227057e-03, 1.527689367197913e-02, 1.527689367198168e-02, 1.527689367197715e-02, 4.042885330071795e-03, 4.042885330072389e-03, 4.042885330072658e-03, 1.229463521022889e-02, 1.229463521022621e-02, 1.229463521022872e-02, 5.427737795462986e-03, 
5.427737795463169e-03, 5.427737795462180e-03, 1.976199256754921e-03, 1.976199256754936e-03, 1.976199256754398e-03, 6.919752984901465e-03, 6.919752984901423e-03, 6.919752984901507e-03, 1.046027736959284e-02, 1.046027736959218e-02, 1.046027736959269e-02, 1.447481953654798e-02, 1.447481953654770e-02, 1.447481953654855e-02, 2.391550565412188e-03, 2.391550565412258e-03, 2.391550565412131e-03, 1.511375252393240e-02, 1.511375252392830e-02, 1.511375252393155e-02, 9.511095825311733e-03, 9.511095825312935e-03, 9.511095825312142e-03, 4.036967691998535e-03, 4.036967691999214e-03, 4.036967691998252e-03, 1.508260348095485e-03, 1.508260348095230e-03, 1.508260348095372e-03, 3.937346811175521e-03, 3.937346811175041e-03, 3.937346811175210e-03, 3.608140871956763e-03, 3.608140871956792e-03, 3.608140871956891e-03, 3.365526474882623e-03, 3.365526474883641e-03, 3.365526474884108e-03, 5.192349870771271e-03, 5.192349870771441e-03, 5.192349870772318e-03, 1.334407891882406e-02, 1.334407891882760e-02, 1.334407891882411e-02, 8.025809123555732e-03, 8.025809123554318e-03, 8.025809123554217e-03, 1.070887983557521e-02, 1.070887983557467e-02, 1.070887983557649e-02, 1.309736406162602e-02, 1.309736406162684e-02, 1.309736406162661e-02, 8.885655025694786e-03, 8.885655025694404e-03, 8.885655025693740e-03, 1.344113214741205e-02, 1.344113214741205e-02, 1.344113214741270e-02, 5.698568685004529e-03, 5.698568685005180e-03, 5.698568685005053e-03, 3.847402885188293e-03, 3.847402885188307e-03, 3.847402885188477e-03, 4.769112898361405e-03, 4.769112898361265e-03, 4.769112898362141e-03, 1.118686556204748e-02, 1.118686556204881e-02, 1.118686556204810e-02, 5.635998890136026e-03, 5.635998890136154e-03, 5.635998890136069e-03, 5.044085510105154e-03, 5.044085510104333e-03, 5.044085510104730e-03, 1.014203848072567e-02, 1.014203848072546e-02, 1.014203848072654e-02, 2.681590717335358e-03, 2.681590717334821e-03, 2.681590717335202e-03, 4.059907280598535e-03, 4.059907280598903e-03, 4.059907280598535e-03, 
3.336448036565914e-03, 3.336448036565589e-03, 3.336448036566042e-03, 1.076375181138509e-02, 1.076375181138425e-02, 1.076375181138517e-02, 6.573170212491274e-03, 6.573170212491260e-03, 6.573170212491358e-03, 8.877430748418416e-04, 8.877430748418500e-04, 8.877430748417468e-04, 1.503300080640668e-03, 1.503300080640583e-03, 1.503300080640908e-03, 5.744672404119040e-03, 5.744672404119181e-03, 5.744672404119252e-03, 1.906496203051725e-03, 1.906496203050283e-03, 1.906496203049802e-03, 2.124665847459818e-03, 2.124665847459747e-03, 2.124665847459747e-03, 7.203086774524117e-04, 7.203086774524287e-04, 7.203086774520808e-04, 6.473262420273394e-04, 6.473262420272856e-04, 6.473262420267072e-04, 2.381499975344257e-03, 2.381499975344469e-03, 2.381499975345459e-03, 3.013274913110685e-03, 3.013274913110755e-03, 3.013274913111859e-03, 2.067897521108355e-03, 2.067897521107776e-03, 2.067897521108949e-03, 1.499773420159665e-02, 1.925991642486046e-02, 1.254797412755794e-02, 1.209377740627947e-02};
 
 
-void mesh_t::CubatureNodesTet3D(int cubTetN, int *_cubNp, dfloat **_cubr, dfloat **_cubs, dfloat **_cubt, dfloat **_cubw){
+void mesh_t::CubatureNodesTet3D(const int cubTetN,
+                                int& _cubNp,
+                                memory<dfloat>& _cubr,
+                                memory<dfloat>& _cubs,
+                                memory<dfloat>& _cubt,
+                                memory<dfloat>& _cubw){
 
-  if (cubTetN>15)
-    LIBP_ABORT(string("Requested Cubature order unavailable."))
+  LIBP_ABORT("Requested Cubature order unavailable.",
+             cubTetN>15);
 
-  int cubTetNp = cubTetNps[cubTetN-1];
+  _cubNp = cubTetNps[cubTetN-1];
 
-  *_cubNp = cubTetNp;
-
-  *_cubr = (dfloat*) calloc(cubTetNp, sizeof(dfloat));
-  *_cubs = (dfloat*) calloc(cubTetNp, sizeof(dfloat));
-  *_cubt = (dfloat*) calloc(cubTetNp, sizeof(dfloat));
-  *_cubw = (dfloat*) calloc(cubTetNp, sizeof(dfloat));
+  _cubr.malloc(_cubNp);
+  _cubs.malloc(_cubNp);
+  _cubt.malloc(_cubNp);
+  _cubw.malloc(_cubNp);
 
   const dfloat *cubTetR=NULL, *cubTetS=NULL, *cubTetT=NULL, *cubTetW=NULL;
   switch(cubTetN){
@@ -933,13 +1163,15 @@ void mesh_t::CubatureNodesTet3D(int cubTetN, int *_cubNp, dfloat **_cubr, dfloat
     case 14: cubTetR = cubR14; cubTetS = cubS14; cubTetT = cubT14; cubTetW = cubW14; break;
     case 15: cubTetR = cubR15; cubTetS = cubS15; cubTetT = cubT15; cubTetW = cubW15; break;
     default:
-      LIBP_ABORT(string("Requested Cubature order unavailable."))
+      LIBP_FORCE_ABORT("Requested Cubature order unavailable.");
   }
 
-  for(int n=0;n<cubTetNp;++n){
-    _cubr[0][n] = cubTetR[n];
-    _cubs[0][n] = cubTetS[n];
-    _cubt[0][n] = cubTetT[n];
-    _cubw[0][n] = cubTetW[n];
+  for(int n=0;n<_cubNp;++n){
+    _cubr[n] = cubTetR[n];
+    _cubs[n] = cubTetS[n];
+    _cubt[n] = cubTetT[n];
+    _cubw[n] = cubTetW[n];
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshBasisTri2D.cpp b/libs/mesh/meshBasisTri2D.cpp
index c0dbb5fb6..a36ed6ed1 100644
--- a/libs/mesh/meshBasisTri2D.cpp
+++ b/libs/mesh/meshBasisTri2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,32 +25,32 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
+
+namespace libp {
 
 // ------------------------------------------------------------------------
 // TRI 2D NODES
 // ------------------------------------------------------------------------
-void mesh_t::NodesTri2D(int _N, dfloat *_r, dfloat *_s){
-
-  int _Np = (_N+1)*(_N+2)/2;
-
+void mesh_t::NodesTri2D(const int _N,
+                        memory<dfloat>& _r,
+                        memory<dfloat>& _s){
   EquispacedNodesTri2D(_N, _r, _s); //make equispaced nodes on reference triangle
-  WarpBlendTransformTri2D(_N, _Np, _r, _s); //apply warp&blend transform
+  WarpBlendTransformTri2D(_N, _r, _s); //apply warp&blend transform
 }
 
-void mesh_t::FaceNodesTri2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes){
-  int _Nfp = _N+1;
-  int _Np = (_N+1)*(_N+2)/2;
+void mesh_t::FaceNodesTri2D(const int _N,
+                            const memory<dfloat> _r,
+                            const memory<dfloat> _s,
+                            memory<int>& _faceNodes){
+  const int _Nfp = _N+1;
+  const int _Np = (_N+1)*(_N+2)/2;
 
   int cnt[3];
   for (int i=0;i<3;i++) cnt[i]=0;
 
-  dfloat deps = 1.;
-  while((1.+deps)>1.)
-    deps *= 0.5;
-
-  const dfloat NODETOL = 1000.*deps;
+  const dfloat NODETOL = 1.0e-5;
 
+  _faceNodes.malloc(3*_Nfp);
   for (int n=0;n<_Np;n++) {
     if(fabs(_s[n]+1)<NODETOL)
       _faceNodes[0*_Nfp+(cnt[0]++)] = n;
@@ -61,8 +61,11 @@ void mesh_t::FaceNodesTri2D(int _N, dfloat *_r, dfloat *_s, int *_faceNodes){
   }
 }
 
-void mesh_t::VertexNodesTri2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes){
-  int _Np = (_N+1)*(_N+2)/2;
+void mesh_t::VertexNodesTri2D(const int _N,
+                              const memory<dfloat> _r,
+                              const memory<dfloat> _s,
+                              memory<int>& _vertexNodes){
+  const int _Np = (_N+1)*(_N+2)/2;
 
   dfloat deps = 1.;
   while((1.+deps)>1.)
@@ -70,6 +73,7 @@ void mesh_t::VertexNodesTri2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes)
 
   const dfloat NODETOL = 1000.*deps;
 
+  _vertexNodes.malloc(3);
   for(int n=0;n<_Np;++n){
     if( (_r[n]+1)*(_r[n]+1)+(_s[n]+1)*(_s[n]+1)<NODETOL)
       _vertexNodes[0] = n;
@@ -80,8 +84,127 @@ void mesh_t::VertexNodesTri2D(int _N, dfloat *_r, dfloat *_s, int *_vertexNodes)
   }
 }
 
+/* Find a matching array between nodes on matching faces.
+ * For each face fM, neighbor face fP and face rotation rot, the entry
+ * R[((fM*_Nfaces+fP)*_NfaceVertices+rot)*_Nfp+n] is the index m of the node on fP
+ * whose coordinate along the shared face matches node n of fM (squared distance < NODETOL).
+ * R is allocated here; aborts via LIBP_ABORT if some node has no match. */
+void mesh_t::FaceNodeMatchingTri2D(const memory<dfloat> _r,
+                                   const memory<dfloat> _s,
+                                   const memory<int> _faceNodes,
+                                   const memory<int> _faceVertices,
+                                   memory<int>& R){
+
+  const int _Nfaces = 3;
+  const int _Nverts = 3;
+  const int _NfaceVertices = 2;
+
+  const int _Nfp = _faceNodes.length()/_Nfaces;
+
+  const dfloat NODETOL = 1.0e-5; // tolerance applied to the squared node distance
+
+  dfloat V[2] = {-1.0, 1.0}; // coordinates assigned to the two vertices of a face
+
+  dfloat EX0[_Nverts];
+  dfloat EX1[_Nverts];
+
+  memory<dfloat> x0(_Nfp);
+  memory<dfloat> x1(_Nfp);
+
+  R.malloc(_Nfaces*_Nfaces*_NfaceVertices*_Nfp);
+
+  for (int fM=0;fM<_Nfaces;fM++) {
+
+    for (int v=0;v<_Nverts;v++) {
+      EX0[v] = 0.0;
+    }
+    //setup top element with face fM on the bottom
+    for (int v=0;v<_NfaceVertices;v++) {
+      int fv = _faceVertices[fM*_NfaceVertices + v];
+      EX0[fv] = V[v];
+    }
+
+    for(int n=0;n<_Nfp;++n){ /* for each face node */
+      const int fn = _faceNodes[fM*_Nfp+n];
+
+      /* (r,s) coordinates of interpolation nodes*/
+      dfloat rn = _r[fn];
+      dfloat sn = _s[fn];
+
+      /* physical coordinate of interpolation node */
+      x0[n] = -0.5*(rn+sn)*EX0[0]
+             + 0.5*(1+rn)*EX0[1]
+             + 0.5*(1+sn)*EX0[2];
+    }
+
+    for (int fP=0;fP<_Nfaces;fP++) { /*For each neighbor face */
+      for (int rot=0;rot<_NfaceVertices;rot++) { /* For each face rotation */
+        // Zero vertices
+        for (int v=0;v<_Nverts;v++) {
+          EX1[v] = 0.0;
+        }
+        //setup bottom element with face fP on the top
+        for (int v=0;v<_NfaceVertices;v++) {
+          int fv = _faceVertices[fP*_NfaceVertices + ((v+rot)%_NfaceVertices)];
+          EX1[fv] = V[v];
+        }
+
+        for(int n=0;n<_Nfp;++n){ /* for each node */
+          const int fn = _faceNodes[fP*_Nfp+n];
+
+          /* (r,s) coordinates of interpolation nodes*/
+          dfloat rn = _r[fn];
+          dfloat sn = _s[fn];
+
+          /* physical coordinate of interpolation node */
+          x1[n] = -0.5*(rn+sn)*EX1[0]
+                 + 0.5*(1+rn)*EX1[1]
+                 + 0.5*(1+sn)*EX1[2];
+        }
+
+        /* for each node on this face find the neighbor node */
+        for(int n=0;n<_Nfp;++n){
+          const dfloat xM = x0[n];
+
+          int m=0;
+          for(;m<_Nfp;++m){ /* for each neighbor node */
+            const dfloat xP = x1[m];
+
+            /* distance between target and neighbor node */
+            const dfloat dist = pow(xM-xP,2);
+
+            /* if neighbor node is close to target, match */
+            if(dist<NODETOL){
+              R[fM*_Nfaces*_NfaceVertices*_Nfp
+                + fP*_NfaceVertices*_Nfp
+                + rot*_Nfp + n] = m;
+              break;
+            }
+          }
+
+          /* Check: m==_Nfp only if the search found no match; x1[m] must not be read then */
+          //This shouldn't happen
+          LIBP_ABORT("Unable to match face node, face: " << fM
+                     << ", matching face: " << fP
+                     << ", rotation: " << rot
+                     << ", node: " << n
+                     << ". Is the reference node set not symmetric?",
+                     m==_Nfp);
+        }
+      }
+    }
+  }
+}
+
 // Create equidistributed nodes on reference triangle
-void mesh_t::EquispacedNodesTri2D(int _N, dfloat *_r, dfloat *_s){
+void mesh_t::EquispacedNodesTri2D(const int _N,
+                                  memory<dfloat>& _r,
+                                  memory<dfloat>& _s){
+
+  const int _Np = (_N+1)*(_N+2)/2;
+
+  _r.malloc(_Np);
+  _s.malloc(_Np);
 
   int sk = 0;
   for (int n=0;n<_N+1;n++) {
@@ -93,8 +216,11 @@ void mesh_t::EquispacedNodesTri2D(int _N, dfloat *_r, dfloat *_s){
   }
 }
 
-void mesh_t::EquispacedEToVTri2D(int _N, int *_EToV){
-  int _Nverts = 3;
+void mesh_t::EquispacedEToVTri2D(const int _N, memory<int>& _EToV){
+  const int _Nverts = 3;
+  const int _Nelements = _N*_N;
+
+  _EToV.malloc(_Nelements*_Nverts);
 
   int cnt=0;
   int sk=0;
@@ -120,7 +246,10 @@ void mesh_t::EquispacedEToVTri2D(int _N, int *_EToV){
   }
 }
 
-void mesh_t::SEMFEMNodesTri2D(int _N, int *_Np, dfloat **_r, dfloat **_s){
+void mesh_t::SEMFEMNodesTri2D(const int _N,
+                              int& _Np,
+                              memory<dfloat>& _r,
+                              memory<dfloat>& _s){
 
   const dfloat alpopt[12] = {0.0000, 5.0000, 3.0000, 2.2073, 2.5259, 2.7113,
                              2.4368, 2.4564, 2.3948, 2.4346, 2.4653, 2.4691};
@@ -138,10 +267,9 @@ void mesh_t::SEMFEMNodesTri2D(int _N, int *_Np, dfloat **_r, dfloat **_s){
 
   const dfloat NODETOL = 1000.*deps;
 
-  *_Np = (_N+1)*(_N+6)/2;
-
-  *_r = (dfloat *) malloc((*_Np)*sizeof(dfloat));
-  *_s = (dfloat *) malloc((*_Np)*sizeof(dfloat));
+  _Np = (_N+1)*(_N+6)/2;
+  _r.malloc(_Np);
+  _s.malloc(_Np);
 
   int sk=0;
   //Order N+1 boundary
@@ -151,8 +279,8 @@ void mesh_t::SEMFEMNodesTri2D(int _N, int *_Np, dfloat **_r, dfloat **_s){
       dfloat ss = -1.0 + 2.0*n/(_N+1);
       if((fabs(ss+1)<NODETOL) || (fabs(rr+ss)<NODETOL)
          || (fabs(rr+1)<NODETOL)) {
-        (*_r)[sk] = rr;
-        (*_s)[sk] = ss;
+        _r[sk] = rr;
+        _s[sk] = ss;
         sk++;
       }
     }
@@ -161,67 +289,69 @@ void mesh_t::SEMFEMNodesTri2D(int _N, int *_Np, dfloat **_r, dfloat **_s){
   //Order N+2 interior
   for (int n=1;n<_N+3-1;n++) {
     for (int m=1;m<_N+3-n-1;m++) {
-      (*_r)[sk] = -1.0 + 2.0*m/(_N+2);
-      (*_s)[sk] = -1.0 + 2.0*n/(_N+2);
+      _r[sk] = -1.0 + 2.0*m/(_N+2);
+      _s[sk] = -1.0 + 2.0*n/(_N+2);
       sk++;
     }
   }
-  WarpBlendTransformTri2D(_N+1, *_Np, *_r, *_s, alpha); //apply warp&blend transform
+  WarpBlendTransformTri2D(_N+1, _r, _s, alpha); //apply warp&blend transform
 }
 
-void mesh_t::SEMFEMEToVTri2D(int _N, int *_NelFEM, int **_EToV){
-  int _Nverts = 3;
+void mesh_t::SEMFEMEToVTri2D(const int _N,
+                             int& _NelFEM,
+                             memory<int>& _EToV){
+  const int _Nverts = 3;
 
-  *_NelFEM = 6+6*(_N-1)+(_N-1)*(_N-1);
-  *_EToV = (int*) malloc((*_NelFEM)*_Nverts*sizeof(int));
+  _NelFEM = 6+6*(_N-1)+(_N-1)*(_N-1);
+  _EToV.malloc(_NelFEM*_Nverts);
 
   //start with corner quads
   int cnt=0;
   int corner = 3*(_N+1); //first interior point
-  (*_EToV)[cnt*_Nverts+0] = 0;
-  (*_EToV)[cnt*_Nverts+1] = 1;
-  (*_EToV)[cnt*_Nverts+2] = _N+2;
+  _EToV[cnt*_Nverts+0] = 0;
+  _EToV[cnt*_Nverts+1] = 1;
+  _EToV[cnt*_Nverts+2] = _N+2;
   cnt++;
 
-  (*_EToV)[cnt*_Nverts+0] = 1;
-  (*_EToV)[cnt*_Nverts+1] = corner;
-  (*_EToV)[cnt*_Nverts+2] = _N+2;
+  _EToV[cnt*_Nverts+0] = 1;
+  _EToV[cnt*_Nverts+1] = corner;
+  _EToV[cnt*_Nverts+2] = _N+2;
   cnt++;
 
   corner += _N-1; //bottom right interior point
-  (*_EToV)[cnt*_Nverts+0] = _N;
-  (*_EToV)[cnt*_Nverts+1] = _N+1;
-  (*_EToV)[cnt*_Nverts+2] = _N+3;
+  _EToV[cnt*_Nverts+0] = _N;
+  _EToV[cnt*_Nverts+1] = _N+1;
+  _EToV[cnt*_Nverts+2] = _N+3;
   cnt++;
 
-  (*_EToV)[cnt*_Nverts+0] = _N;
-  (*_EToV)[cnt*_Nverts+1] = _N+3;
-  (*_EToV)[cnt*_Nverts+2] = corner;
+  _EToV[cnt*_Nverts+0] = _N;
+  _EToV[cnt*_Nverts+1] = _N+3;
+  _EToV[cnt*_Nverts+2] = corner;
   cnt++;
 
   corner = (_N+1)*(_N+6)/2-1; //top interior point
-  (*_EToV)[cnt*_Nverts+0] = 3*_N;
-  (*_EToV)[cnt*_Nverts+1] = 3*_N+1;
-  (*_EToV)[cnt*_Nverts+2] = 3*_N+2;
+  _EToV[cnt*_Nverts+0] = 3*_N;
+  _EToV[cnt*_Nverts+1] = 3*_N+1;
+  _EToV[cnt*_Nverts+2] = 3*_N+2;
   cnt++;
 
-  (*_EToV)[cnt*_Nverts+0] = 3*_N;
-  (*_EToV)[cnt*_Nverts+1] = corner;
-  (*_EToV)[cnt*_Nverts+2] = 3*_N+1;
+  _EToV[cnt*_Nverts+0] = 3*_N;
+  _EToV[cnt*_Nverts+1] = corner;
+  _EToV[cnt*_Nverts+2] = 3*_N+1;
   cnt++;
 
   //next the edges
   corner = 3*(_N+1); //first interior point
   int inc = 1; // increment to next interior point along this edge
   for (int i=0;i<_N-1;i++) {
-    (*_EToV)[cnt*_Nverts+0] = i+1;
-    (*_EToV)[cnt*_Nverts+1] = i+2;
-    (*_EToV)[cnt*_Nverts+2] = corner;
+    _EToV[cnt*_Nverts+0] = i+1;
+    _EToV[cnt*_Nverts+1] = i+2;
+    _EToV[cnt*_Nverts+2] = corner;
     cnt++;
 
-    (*_EToV)[cnt*_Nverts+0] = i+2;
-    (*_EToV)[cnt*_Nverts+1] = corner+inc;
-    (*_EToV)[cnt*_Nverts+2] = corner;
+    _EToV[cnt*_Nverts+0] = i+2;
+    _EToV[cnt*_Nverts+1] = corner+inc;
+    _EToV[cnt*_Nverts+2] = corner;
     cnt++;
     corner += inc;
   }
@@ -229,14 +359,14 @@ void mesh_t::SEMFEMEToVTri2D(int _N, int *_NelFEM, int **_EToV){
   corner = 3*(_N+1); //first interior point
   inc = _N; // increment to next interior point along this edge
   for (int i=0;i<_N-1;i++) {
-    (*_EToV)[cnt*_Nverts+0] = _N+2+2*i;
-    (*_EToV)[cnt*_Nverts+1] = corner;
-    (*_EToV)[cnt*_Nverts+2] = _N+4+2*i;
+    _EToV[cnt*_Nverts+0] = _N+2+2*i;
+    _EToV[cnt*_Nverts+1] = corner;
+    _EToV[cnt*_Nverts+2] = _N+4+2*i;
     cnt++;
 
-    (*_EToV)[cnt*_Nverts+0] = corner;
-    (*_EToV)[cnt*_Nverts+1] = corner+inc;
-    (*_EToV)[cnt*_Nverts+2] = _N+4+2*i;
+    _EToV[cnt*_Nverts+0] = corner;
+    _EToV[cnt*_Nverts+1] = corner+inc;
+    _EToV[cnt*_Nverts+2] = _N+4+2*i;
     cnt++;
     corner += inc;
     inc--;
@@ -245,14 +375,14 @@ void mesh_t::SEMFEMEToVTri2D(int _N, int *_NelFEM, int **_EToV){
   corner = 3*(_N+1)+_N-1; //bottom right interior point
   inc = _N-1; // increment to next interior point along this edge
   for (int i=0;i<_N-1;i++) {
-    (*_EToV)[cnt*_Nverts+0] = corner;
-    (*_EToV)[cnt*_Nverts+1] = _N+3+2*i;
-    (*_EToV)[cnt*_Nverts+2] = _N+5+2*i;
+    _EToV[cnt*_Nverts+0] = corner;
+    _EToV[cnt*_Nverts+1] = _N+3+2*i;
+    _EToV[cnt*_Nverts+2] = _N+5+2*i;
     cnt++;
 
-    (*_EToV)[cnt*_Nverts+0] = corner;
-    (*_EToV)[cnt*_Nverts+1] = _N+5+2*i;
-    (*_EToV)[cnt*_Nverts+2] = corner+inc;
+    _EToV[cnt*_Nverts+0] = corner;
+    _EToV[cnt*_Nverts+1] = _N+5+2*i;
+    _EToV[cnt*_Nverts+2] = corner+inc;
     cnt++;
     corner += inc;
     inc--;
@@ -265,15 +395,15 @@ void mesh_t::SEMFEMEToVTri2D(int _N, int *_NelFEM, int **_EToV){
     int shift = _N-j; //number of nodes in this row
 
     for (int i=0;i<_N-j-1;i++) {
-      (*_EToV)[cnt*_Nverts+0] = sk  ;
-      (*_EToV)[cnt*_Nverts+1] = sk+1;
-      (*_EToV)[cnt*_Nverts+2] = sk+shift;
+      _EToV[cnt*_Nverts+0] = sk  ;
+      _EToV[cnt*_Nverts+1] = sk+1;
+      _EToV[cnt*_Nverts+2] = sk+shift;
       cnt++;
 
       if (i!=_N-j-2) {
-        (*_EToV)[cnt*_Nverts+0] = sk+1;
-        (*_EToV)[cnt*_Nverts+1] = sk+shift+1;
-        (*_EToV)[cnt*_Nverts+2] = sk+shift;
+        _EToV[cnt*_Nverts+0] = sk+1;
+        _EToV[cnt*_Nverts+1] = sk+shift+1;
+        _EToV[cnt*_Nverts+2] = sk+shift;
         cnt++;
       }
       sk++;
@@ -285,7 +415,9 @@ void mesh_t::SEMFEMEToVTri2D(int _N, int *_NelFEM, int **_EToV){
 // ------------------------------------------------------------------------
 // ORTHONORMAL BASIS POLYNOMIALS
 // ------------------------------------------------------------------------
-void mesh_t::OrthonormalBasisTri2D(dfloat _r, dfloat _s, int i, int j, dfloat *P){
+void mesh_t::OrthonormalBasisTri2D(const dfloat _r, const dfloat _s,
+                                   const int i, const int j,
+                                   dfloat& P){
   dfloat a,b;
   if(_s != 1.)
     a = 2.*(1.+_r)/(1.-_s)-1.;
@@ -293,10 +425,12 @@ void mesh_t::OrthonormalBasisTri2D(dfloat _r, dfloat _s, int i, int j, dfloat *P
     a = -1.;
   b=_s;
 
-  *P = sqrt(2.0)*JacobiP(a,0,0,i)*JacobiP(b,2*i+1,0,j)*pow(1.-b,i);
+  P = sqrt(2.0)*JacobiP(a,0,0,i)*JacobiP(b,2*i+1,0,j)*pow(1.-b,i);
 }
 
-void mesh_t::GradOrthonormalBasisTri2D(dfloat _r, dfloat _s, int i, int j, dfloat *Pr, dfloat *Ps){
+void mesh_t::GradOrthonormalBasisTri2D(const dfloat _r, const dfloat _s,
+                                       const int i, const int j,
+                                       dfloat& Pr, dfloat& Ps){
   dfloat a,b;
   if(_s != 1.)
     a = 2.*(1.+_r)/(1.-_s)-1.;
@@ -309,56 +443,68 @@ void mesh_t::GradOrthonormalBasisTri2D(dfloat _r, dfloat _s, int i, int j, dfloa
 
   // r-derivative
   // d/dr = da/dr d/da + db/dr d/db = (2/(1-s)) d/da = (2/(1-b)) d/da
-  (*Pr) = dfa*gb;
+  Pr = dfa*gb;
   if(i>0)
-    (*Pr) = (*Pr)*pow(0.5*(1-b),i-1);
+    Pr = Pr*pow(0.5*(1-b),i-1);
 
   // s-derivative
   // d/ds = ((1+a)/2)/((1-b)/2) d/da + d/db
-  (*Ps) = dfa*(gb*(0.5*(1+a)));
+  Ps = dfa*(gb*(0.5*(1+a)));
   if(i>0)
-   (*Ps) = (*Ps)*pow(0.5*(1-b),i-1);
+   Ps = Ps*pow(0.5*(1-b),i-1);
 
   dfloat tmp = dgb*pow(0.5*(1-b),i);
   if(i>0)
     tmp = tmp-0.5*i*gb*pow(0.5*(1-b),i-1);
 
-  (*Ps) = (*Ps)+fa*tmp;
+  Ps = Ps+fa*tmp;
 
   // Normalize
-  (*Pr) *= pow(2,i+0.5); (*Ps) *= pow(2,i+0.5);
+  Pr *= pow(2,i+0.5); Ps *= pow(2,i+0.5);
 }
 
 // ------------------------------------------------------------------------
 // 2D VANDERMONDE MATRICES
 // ------------------------------------------------------------------------
 
-void mesh_t::VandermondeTri2D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *V){
+void mesh_t::VandermondeTri2D(const int _N,
+                              const memory<dfloat> _r,
+                              const memory<dfloat> _s,
+                              memory<dfloat>& V){
 
-  int _Np = (_N+1)*(_N+2)/2;
+  const int _Np = (_N+1)*(_N+2)/2;
+  const int Npoints = _r.length();
 
+  V.malloc(Npoints*_Np);
   for(int n=0; n<Npoints; n++){
     int sk=0;
     for(int i=0; i<_N+1; i++){
       for(int j=0; j<_N+1-i; j++){
         int id = n*_Np+sk;
-        OrthonormalBasisTri2D(_r[n], _s[n], i, j, V+id);
+        OrthonormalBasisTri2D(_r[n], _s[n], i, j, V[id]);
         sk++;
       }
     }
   }
 }
 
-void mesh_t::GradVandermondeTri2D(int _N, int Npoints, dfloat *_r, dfloat *_s, dfloat *Vr, dfloat *Vs){
+void mesh_t::GradVandermondeTri2D(const int _N,
+                                  const memory<dfloat> _r,
+                                  const memory<dfloat> _s,
+                                  memory<dfloat>& Vr,
+                                  memory<dfloat>& Vs){
 
-  int _Np = (_N+1)*(_N+2)/2;
+  const int _Np = (_N+1)*(_N+2)/2;
+  const int Npoints = _r.length();
 
+  Vr.malloc(Npoints*_Np);
+  Vs.malloc(Npoints*_Np);
   for(int n=0; n<Npoints; n++){
     int sk=0;
     for(int i=0; i<_N+1; i++){
       for(int j=0; j<_N+1-i; j++){
         int id = n*_Np+sk;
-        GradOrthonormalBasisTri2D(_r[n], _s[n], i, j, Vr+id, Vs+id);
+        GradOrthonormalBasisTri2D(_r[n], _s[n], i, j, Vr[id], Vs[id]);
         sk++;
       }
     }
@@ -368,9 +514,12 @@ void mesh_t::GradVandermondeTri2D(int _N, int Npoints, dfloat *_r, dfloat *_s, d
 // ------------------------------------------------------------------------
 // 2D OPERATOR MATRICES
 // ------------------------------------------------------------------------
-void mesh_t::MassMatrixTri2D(int _Np, dfloat *V, dfloat *_MM){
+void mesh_t::MassMatrixTri2D(const int _Np,
+                             const memory<dfloat> V,
+                             memory<dfloat>& _MM){
 
   // massMatrix = inv(V')*inv(V) = inv(V*V')
+  _MM.malloc(_Np*_Np);
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_Np;++m){
       dfloat res = 0;
@@ -380,12 +529,15 @@ void mesh_t::MassMatrixTri2D(int _Np, dfloat *V, dfloat *_MM){
       _MM[n*_Np + m] = res;
     }
   }
-  matrixInverse(_Np, _MM);
+  linAlg_t::matrixInverse(_Np, _MM);
 }
 
-void mesh_t::invMassMatrixTri2D(int _Np, dfloat *V, dfloat *_invMM){
+void mesh_t::invMassMatrixTri2D(const int _Np, // matrix dimension
+                                const memory<dfloat> V, // input matrix referred to as V in the formula below
+                                memory<dfloat>& _invMM){ // output, allocated below
 
   // massMatrix^{-1} = V*V'
+  _invMM.malloc(_Np*_Np); // storage for the _Np x _Np result is now allocated inside the routine
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_Np;++m){
       dfloat res = 0;
@@ -397,40 +549,41 @@ void mesh_t::invMassMatrixTri2D(int _Np, dfloat *V, dfloat *_invMM){
   }
 }
 
-void mesh_t::DmatrixTri2D(int _N, int Npoints, dfloat *_r, dfloat *_s,
-                                                dfloat *_Dr, dfloat *_Ds){
+void mesh_t::DmatrixTri2D(const int _N,
+                          const memory<dfloat> _r, // point coordinates; the solves below treat the point count as _Np
+                          const memory<dfloat> _s,
+                          memory<dfloat>& _D){ // single output replacing the former _Dr/_Ds pair
 
-  int _Np = (_N+1)*(_N+2)/2;
+  const int _Np = (_N+1)*(_N+2)/2;
 
-  dfloat *V  = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vr = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-  dfloat *Vs = (dfloat *) calloc(Npoints*_Np, sizeof(dfloat));
-
-  VandermondeTri2D(_N, Npoints, _r, _s, V);
-  GradVandermondeTri2D(_N, Npoints, _r, _s, Vr, Vs);
+  memory<dfloat> V, Vr, Vs; // declared without a size; handed to the two builders below as outputs
+  VandermondeTri2D(_N, _r, _s, V);
+  GradVandermondeTri2D(_N, _r, _s, Vr, Vs);
 
   //Dr = Vr/V, Ds = Vs/V
-  matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr);
-  matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds);
-
-  free(V); free(Vr); free(Vs);
+  _D.malloc(2*_Np*_Np); // room for two _Np x _Np matrices stored back to back
+  memory<dfloat> _Dr = _D + 0*_Np*_Np; // offset 0 into _D (presumably a view sharing _D's storage - confirm memory<T>::operator+)
+  memory<dfloat> _Ds = _D + 1*_Np*_Np; // offset _Np*_Np into _D
+  linAlg_t::matrixRightSolve(_Np, _Np, Vr, _Np, _Np, V, _Dr);
+  linAlg_t::matrixRightSolve(_Np, _Np, Vs, _Np, _Np, V, _Ds);
 }
 
-void mesh_t::LIFTmatrixTri2D(int _N, int *_faceNodes,
-                             dfloat *_r, dfloat *_s, dfloat *_LIFT){
+void mesh_t::LIFTmatrixTri2D(const int _N,
+                             const memory<int> _faceNodes, // read as _faceNodes[f*_Nfp+i]: node index i of face f
+                             const memory<dfloat> _r,
+                             const memory<dfloat> _s,
+                             memory<dfloat>& _LIFT){ // output, allocated below
 
-  int _Nfp = (_N+1);
-  int _Np = (_N+1)*(_N+2)/2;
-  int _Nfaces = 3;
+  const int _Nfp = (_N+1); // nodes per face
+  const int _Np = (_N+1)*(_N+2)/2;
+  const int _Nfaces = 3;
 
-  dfloat *E = (dfloat *) calloc(_Np*_Nfaces*_Nfp, sizeof(dfloat));
+  memory<dfloat> E(_Np*_Nfaces*_Nfp, 0); // takes over from the calloc'd buffer; second argument is presumably the fill value
 
-  dfloat *r1D = (dfloat *) malloc(_Nfp*sizeof(dfloat));
-  dfloat *V1D = (dfloat *) malloc(_Nfp*_Nfp*sizeof(dfloat));
-  dfloat *MM1D = (dfloat *) malloc(_Nfp*_Nfp*sizeof(dfloat));
+  memory<dfloat> r1D(_Nfp); // coordinates of one face's nodes; refilled for every face
 
   for (int f=0;f<_Nfaces;f++) {
-    dfloat *rFace;
+    memory<dfloat> rFace; // set to _r or _s depending on the face
     if (f==0) rFace = _r;
     if (f==1) rFace = _r;
     if (f==2) rFace = _s;
@@ -438,7 +591,8 @@ void mesh_t::LIFTmatrixTri2D(int _N, int *_faceNodes,
     for (int i=0;i<_Nfp;i++)
       r1D[i] = rFace[_faceNodes[f*_Nfp+i]];
 
-    Vandermonde1D(_N, _Nfp, r1D, V1D);
+    memory<dfloat> V1D, MM1D; // now declared per face; passed to the 1D builders below as outputs
+    Vandermonde1D(_N, r1D, V1D);
     MassMatrix1D(_Nfp, V1D, MM1D);
 
     for (int j=0;j<_Nfp;j++) {
@@ -449,9 +603,10 @@ void mesh_t::LIFTmatrixTri2D(int _N, int *_faceNodes,
     }
   }
 
-  dfloat *V = (dfloat *) malloc(_Np*_Np*sizeof(dfloat));
-  VandermondeTri2D(_N, _Np, _r, _s, V);
+  memory<dfloat> V; // declared without a size; VandermondeTri2D receives it as its output argument
+  VandermondeTri2D(_N, _r, _s, V);
 
+  _LIFT.malloc(_Np*_Nfaces*_Nfp); // _Np rows by _Nfaces*_Nfp columns, matching the loop bounds below
   for (int n=0;n<_Np;n++) {
     for (int m=0;m<_Nfaces*_Nfp;m++) {
 
@@ -465,16 +620,18 @@ void mesh_t::LIFTmatrixTri2D(int _N, int *_faceNodes,
       }
     }
   }
-
-  free(V); free(r1D); free(V1D); free(MM1D); free(E);
 }
 
-void mesh_t::SurfaceMassMatrixTri2D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat *_sM){
+void mesh_t::SurfaceMassMatrixTri2D(const int _N,
+                                    const memory<dfloat> _MM,
+                                    const memory<dfloat> _LIFT,
+                                    memory<dfloat>& _sM){ // output, allocated below
 
-  int _Nfp = (_N+1);
-  int _Np = (_N+1)*(_N+2)/2;
-  int _Nfaces = 3;
+  const int _Nfp = (_N+1);
+  const int _Np = (_N+1)*(_N+2)/2;
+  const int _Nfaces = 3;
 
+  _sM.malloc(_Np*_Nfaces*_Nfp); // _Np rows of _Nfp*_Nfaces entries; each entry is zeroed in the loop below
   for (int n=0;n<_Np;n++) {
     for (int m=0;m<_Nfp*_Nfaces;m++) {
       _sM[m+n*_Nfp*_Nfaces] = 0;
@@ -485,11 +642,18 @@ void mesh_t::SurfaceMassMatrixTri2D(int _N, dfloat *_MM, dfloat *_LIFT, dfloat *
   }
 }
 
-void mesh_t::SmatrixTri2D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_MM,
-                          dfloat *_Srr, dfloat *_Srs, dfloat *_Sss){
+void mesh_t::SmatrixTri2D(const int _N,
+                          const memory<dfloat> _Dr,
+                          const memory<dfloat> _Ds,
+                          const memory<dfloat> _MM,
+                          memory<dfloat>& _S){ // single output replacing the former _Srr/_Srs/_Sss triple
 
-  int _Np = (_N+1)*(_N+2)/2;
+  const int _Np = (_N+1)*(_N+2)/2;
 
+  _S.malloc(3*_Np*_Np, 0.0); // three _Np x _Np matrices in one buffer; 0.0 is presumably the initial fill value
+  memory<dfloat> _Srr = _S + 0*_Np*_Np; // offset 0 into _S (presumably a view sharing _S's storage - confirm)
+  memory<dfloat> _Srs = _S + 1*_Np*_Np; // offset _Np*_Np
+  memory<dfloat> _Sss = _S + 2*_Np*_Np; // offset 2*_Np*_Np
   for (int n=0;n<_Np;n++) {
     for (int m=0;m<_Np;m++) {
       for (int k=0;k<_Np;k++) {
@@ -504,56 +668,61 @@ void mesh_t::SmatrixTri2D(int _N, dfloat *_Dr, dfloat *_Ds, dfloat *_MM,
   }
 }
 
-void mesh_t::InterpolationMatrixTri2D(int _N,
-                               int NpointsIn, dfloat *rIn, dfloat *sIn,
-                               int NpointsOut, dfloat *rOut, dfloat *sOut,
-                               dfloat *I){
+void mesh_t::InterpolationMatrixTri2D(const int _N,
+                                      const memory<dfloat> rIn, // source points; length must equal _Np (checked below)
+                                      const memory<dfloat> sIn,
+                                      const memory<dfloat> rOut, // target points; any count
+                                      const memory<dfloat> sOut,
+                                      memory<dfloat>& I){ // output, allocated below
 
-  int _Np = (_N+1)*(_N+2)/2;
+  const int _Np = (_N+1)*(_N+2)/2;
 
-  // need NpointsIn = _Np
-  if (NpointsIn != _Np)
-    LIBP_ABORT(string("Invalid Interplation operator requested."))
+  const int NpointsIn  = rIn.length(); // point counts now come from the array lengths
+  const int NpointsOut = rOut.length();
 
-  dfloat *VIn = (dfloat*) malloc(NpointsIn*_Np*sizeof(dfloat));
-  dfloat *VOut= (dfloat*) malloc(NpointsOut*_Np*sizeof(dfloat));
-
-  VandermondeTri2D(_N, NpointsIn,   rIn, sIn, VIn);
-  VandermondeTri2D(_N, NpointsOut, rOut, sOut, VOut);
+  // need NpointsIn = _Np
+  LIBP_ABORT("Invalid Interplation operator requested.", // NOTE(review): "Interplation" typo carried over from the old message
+             NpointsIn != _Np); // the abort condition is now passed to the macro instead of a surrounding if
 
-  matrixRightSolve(NpointsOut, _Np, VOut, NpointsIn, _Np, VIn, I);
+  memory<dfloat> VIn;
+  memory<dfloat> VOut;
+  VandermondeTri2D(_N, rIn, sIn, VIn);
+  VandermondeTri2D(_N, rOut, sOut, VOut);
 
-  free(VIn); free(VOut);
+  I.malloc(NpointsIn*NpointsOut); // NpointsOut x _Np result; same size because NpointsIn == _Np is enforced above
+  linAlg_t::matrixRightSolve(NpointsOut, _Np, VOut,
+                             NpointsIn, _Np, VIn, I);
 }
 
-void mesh_t::DegreeRaiseMatrixTri2D(int Nc, int Nf, dfloat *P){
-
-  int Npc = (Nc+1)*(Nc+2)/2;
-  int Npf = (Nf+1)*(Nf+2)/2;
-
-  dfloat *rc = (dfloat *) malloc(Npc*sizeof(dfloat));
-  dfloat *sc = (dfloat *) malloc(Npc*sizeof(dfloat));
-  dfloat *rf = (dfloat *) malloc(Npf*sizeof(dfloat));
-  dfloat *sf = (dfloat *) malloc(Npf*sizeof(dfloat));
+void mesh_t::DegreeRaiseMatrixTri2D(const int Nc, const int Nf, // orders of the two node sets built below
+                                    memory<dfloat>& P){ // output; allocated inside InterpolationMatrixTri2D
 
+  memory<dfloat> rc, sc; // order-Nc node coordinates, produced by NodesTri2D below (presumably sized there - confirm)
+  memory<dfloat> rf, sf; // order-Nf node coordinates
   NodesTri2D(Nc, rc, sc);
   NodesTri2D(Nf, rf, sf);
 
-  InterpolationMatrixTri2D(Nc, Npc, rc, sc, Npf, rf, sf, P);
-
-  free(rc); free(sc); free(rf); free(sf);
+  InterpolationMatrixTri2D(Nc, rc, sc, rf, sf, P); // order-Nc interpolation from the (rc,sc) nodes to the (rf,sf) nodes
 }
 
-void mesh_t::CubaturePmatrixTri2D(int _N, int _Np, dfloat *_r, dfloat *_s,
-                                  int _cubNp, dfloat *_cubr, dfloat *_cubs, dfloat *_cubProject){
+void mesh_t::CubaturePmatrixTri2D(const int _N,
+                                  const memory<dfloat> _r,
+                                  const memory<dfloat> _s,
+                                  const memory<dfloat> _cubr, // cubature point coordinates; length defines _cubNp
+                                  const memory<dfloat> _cubs,
+                                  memory<dfloat>& _cubProject){ // output, allocated below
+
+  const int _Np = (_N+1)*(_N+2)/2; // formerly a parameter; now derived from _N
+  const int _cubNp = _cubr.length(); // formerly a parameter; now the length of _cubr
 
-  dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
-  VandermondeTri2D(_N, _Np, _r, _s, V);
+  memory<dfloat> V; // Vandermonde matrix at the (_r,_s) points
+  VandermondeTri2D(_N, _r, _s, V);
 
-  dfloat *cubV  = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat));
-  VandermondeTri2D(_N, _cubNp, _cubr, _cubs, cubV);
+  memory<dfloat> cubV; // Vandermonde matrix at the cubature points
+  VandermondeTri2D(_N, _cubr, _cubs, cubV);
 
   // cubProject = V*cV' %% relies on (transpose(cV)*diag(cubw)*cV being the identity)
+  _cubProject.malloc(_Np*_cubNp); // _Np rows by _cubNp columns (indexed n*_cubNp+m below)
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_cubNp;++m){
       dfloat resP = 0;
@@ -563,24 +732,30 @@ void mesh_t::CubaturePmatrixTri2D(int _N, int _Np, dfloat *_r, dfloat *_s,
       _cubProject[n*_cubNp+m] = resP;
     }
   }
-  free(V); free(cubV);
 }
 
-void mesh_t::CubatureWeakDmatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_s,
-                                        int _cubNp, dfloat *_cubr, dfloat *_cubs,
-                                        dfloat *_cubPDrT, dfloat *_cubPDsT){
+void mesh_t::CubatureWeakDmatricesTri2D(const int _N,
+                                        const memory<dfloat> _r,
+                                        const memory<dfloat> _s,
+                                        const memory<dfloat> _cubr, // cubature point coordinates; length defines _cubNp
+                                        const memory<dfloat> _cubs,
+                                        memory<dfloat>& _cubPDT){ // single output replacing the former _cubPDrT/_cubPDsT pair
 
-  dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
-  VandermondeTri2D(_N, _Np, _r, _s, V);
+  const int _Np = (_N+1)*(_N+2)/2; // formerly a parameter; now derived from _N
+  const int _cubNp = _cubr.length(); // formerly a parameter; now the length of _cubr
 
-  dfloat *cubV  = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat));
-  dfloat *cubVr = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat));
-  dfloat *cubVs = (dfloat*) malloc(_cubNp*_Np*sizeof(dfloat));
-  VandermondeTri2D(_N, _cubNp, _cubr, _cubs, cubV);
-  GradVandermondeTri2D(_N, _cubNp, _cubr, _cubs, cubVr, cubVs);
+  memory<dfloat> V; // Vandermonde matrix at the (_r,_s) points
+  VandermondeTri2D(_N, _r, _s, V);
+
+  memory<dfloat> cubV, cubVr, cubVs; // Vandermonde and gradient Vandermonde matrices at the cubature points
+  VandermondeTri2D(_N, _cubr, _cubs, cubV);
+  GradVandermondeTri2D(_N, _cubr, _cubs, cubVr, cubVs);
 
   // cubPDrT = V*transpose(cVr);
   // cubPDsT = V*transpose(cVs);
+  _cubPDT.malloc(2*_Np*_cubNp); // two _Np x _cubNp matrices stored back to back
+  memory<dfloat> _cubPDrT = _cubPDT + 0*_Np*_cubNp; // offset 0 into _cubPDT (presumably a view sharing its storage - confirm)
+  memory<dfloat> _cubPDsT = _cubPDT + 1*_Np*_cubNp; // offset _Np*_cubNp
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_cubNp;++m){
       dfloat resPDrT = 0, resPDsT = 0;
@@ -593,22 +768,28 @@ void mesh_t::CubatureWeakDmatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_s,
       _cubPDsT[n*_cubNp+m] = resPDsT;
     }
   }
-  free(V); free(cubV); free(cubVr); free(cubVs);
 }
 
-void mesh_t::CubatureSurfaceMatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_s, int *_faceNodes,
-                                    int _intNfp, dfloat *_intr, dfloat *_intw,
-                                    dfloat *_intInterp, dfloat *_intLIFT){
+void mesh_t::CubatureSurfaceMatricesTri2D(const int _N,
+                                          const memory<dfloat> _r,
+                                          const memory<dfloat> _s,
+                                          const memory<int> _faceNodes, // read as _faceNodes[f*_Nfp+m]
+                                          const memory<dfloat> _intr, // 1D integration nodes; length defines _intNfp
+                                          const memory<dfloat> _intw, // weights, indexed like _intr
+                                          memory<dfloat>& _intInterp, // output, allocated below
+                                          memory<dfloat>& _intLIFT){ // output, allocated below
 
-  int _Nfaces = 3;
-  int _Nfp = _N+1;
+  const int _Np = (_N+1)*(_N+2)/2; // formerly a parameter; now derived from _N
+  const int _Nfaces = 3;
+  const int _Nfp = _N+1;
+  const int _intNfp = _intr.length(); // formerly a parameter; now the length of _intr
 
-  dfloat *V = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
-  VandermondeTri2D(_N, _Np, _r, _s, V);
+  memory<dfloat> V;
+  VandermondeTri2D(_N, _r, _s, V);
 
-  dfloat *ir = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat));
-  dfloat *is = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat));
-  dfloat *iw = (dfloat*) calloc(_intNfp*_Nfaces, sizeof(dfloat));
+  memory<dfloat> ir(_intNfp*_Nfaces); // NOTE(review): was calloc (zeroed); no fill value is passed now - fine only if the loop below writes every entry
+  memory<dfloat> is(_intNfp*_Nfaces); // one segment of _intNfp entries per face, like ir
+  memory<dfloat> iw(_intNfp*_Nfaces); // weights laid out per face, like ir
 
   for(int n=0;n<_intNfp;++n){
     ir[0*_intNfp + n] =  _intr[n];
@@ -624,9 +805,10 @@ void mesh_t::CubatureSurfaceMatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_
     iw[2*_intNfp + n] =  _intw[n];
   }
 
-  dfloat *sInterp = (dfloat*) malloc(_intNfp*_Nfaces*_Np*sizeof(dfloat));
-  InterpolationMatrixTri2D(_N, _Np, _r, _s, _Nfaces*_intNfp, ir, is, sInterp);
+  memory<dfloat> sInterp; // allocated by the callee; rows follow the combined (ir,is) point list
+  InterpolationMatrixTri2D(_N, _r, _s, ir, is, sInterp);
 
+  _intInterp.malloc(_Nfaces*_intNfp*_Nfp); // one _intNfp x _Nfp matrix per face
   for(int n=0;n<_intNfp;++n){
     for(int m=0;m<_Nfp;++m){
       _intInterp[0*_intNfp*_Nfp + n*_Nfp + m] = sInterp[(n+0*_intNfp)*_Np+_faceNodes[0*_Nfp+m]];
@@ -637,6 +819,7 @@ void mesh_t::CubatureSurfaceMatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_
 
   // integration node lift matrix
   //iLIFT = V*V'*sInterp'*diag(iw(:));
+  _intLIFT.malloc(_Nfaces*_intNfp*_Np); // _Np rows by _Nfaces*_intNfp columns (indexed m*_Nfaces*_intNfp+n below)
   for(int n=0;n<_Nfaces*_intNfp;++n){
     for(int m=0;m<_Np;++m){
       _intLIFT[m*_Nfaces*_intNfp+n] = 0.0;
@@ -647,19 +830,22 @@ void mesh_t::CubatureSurfaceMatricesTri2D(int _N, int _Np, dfloat *_r, dfloat *_
       }
     }
   }
-
-  free(V); free(ir);  free(is);  free(iw);  free(sInterp);
 }
 
-void mesh_t::SEMFEMInterpMatrixTri2D(int _N,
-                                    int _Np, dfloat *_r, dfloat *_s,
-                                    int _NpFEM, dfloat *_rFEM, dfloat *_sFEM,
-                                    dfloat *I){
+void mesh_t::SEMFEMInterpMatrixTri2D(const int _N,
+                                     const memory<dfloat> _r,
+                                     const memory<dfloat> _s,
+                                     const memory<dfloat> _rFEM, // FEM point coordinates; length defines _NpFEM
+                                     const memory<dfloat> _sFEM,
+                                     memory<dfloat>& I){ // output, allocated below
 
-  dfloat *IQN = (dfloat*) malloc(_NpFEM*_Np*sizeof(dfloat));
-  InterpolationMatrixTri2D(_N, _Np, _r, _s, _NpFEM, _rFEM, _sFEM, IQN);
+  const int _Np = (_N+1)*(_N+2)/2; // formerly a parameter; now derived from _N
+  const int _NpFEM = _rFEM.length(); // formerly a parameter; now the length of _rFEM
 
-  dfloat *IQTIQ = (dfloat*) malloc(_Np*_Np*sizeof(dfloat));
+  memory<dfloat> IQN; // interpolation from the (_r,_s) points to the FEM points; allocated by the callee
+  InterpolationMatrixTri2D(_N, _r, _s, _rFEM, _sFEM, IQN);
+
+  memory<dfloat> IQTIQ(_Np*_Np); // _Np x _Np product filled by the loop below
   // IQTIQ = IQN'*IQN
   for(int n=0;n<_Np;++n){
     for(int m=0;m<_Np;++m){
@@ -671,9 +857,8 @@ void mesh_t::SEMFEMInterpMatrixTri2D(int _N,
   }
 
   // I = IQN/(IQN'*IQN)  - pseudo inverse
-  matrixRightSolve(_NpFEM, _Np, IQN, _Np, _Np, IQTIQ, I);
-
-  free(IQN); free(IQTIQ);
+  I.malloc(_NpFEM*_Np); // same shape as IQN
+  linAlg_t::matrixRightSolve(_NpFEM, _Np, IQN, _Np, _Np, IQTIQ, I);
 }
 
 // ------------------------------------------------------------------------
@@ -682,19 +867,22 @@ void mesh_t::SEMFEMInterpMatrixTri2D(int _N,
 //                       Journal of engineering mathematics, 56(3), 247-262.
 // ------------------------------------------------------------------------
 
-void mesh_t::Warpfactor(int _N, int Npoints, dfloat *_r, dfloat *warp) {
+void mesh_t::Warpfactor(const int _N,
+                        const memory<dfloat> _r, // evaluation points; length defines Npoints
+                        memory<dfloat> warp) { // taken by value and indexed up to Npoints; no allocation is visible here (assumes copies share the caller's storage - confirm)
   // Compute scaled warp function at order N
   // based on rout interpolation nodes
+  const int Npoints = _r.length(); // replaces the former explicit Npoints argument
 
   // Compute GLL and equidistant node distribution
-  dfloat *GLLr = (dfloat *) malloc((_N+1)*sizeof(dfloat));
-  dfloat *req  = (dfloat *) malloc((_N+1)*sizeof(dfloat));
+  memory<dfloat> GLLr; // no size given here; passed to JacobiGLL below
+  memory<dfloat> req; // no size given here; passed to EquispacedNodes1D below
   JacobiGLL(_N, GLLr);
   EquispacedNodes1D(_N, req);
 
   // Make interpolation from req to r
-  dfloat *I = (dfloat*) malloc((_N+1)*Npoints*sizeof(dfloat));
-  InterpolationMatrix1D(_N, _N+1, req, Npoints, _r, I);
+  memory<dfloat> I; // no size given here; passed to InterpolationMatrix1D below
+  InterpolationMatrix1D(_N, req, _r, I);
 
   // Compute warp factor
   for (int n=0;n<Npoints;n++) {
@@ -705,29 +893,37 @@ void mesh_t::Warpfactor(int _N, int Npoints, dfloat *_r, dfloat *warp) {
     }
 
     // Scale factor
-    dfloat zerof = (abs(_r[n])<1.0-1.0e-10) ? 1 : 0;
+    dfloat zerof = (std::abs(_r[n])<1.0-1.0e-10) ? 1 : 0; // std:: qualification avoids picking up the integer C abs(); zerof is 1 when |_r[n]| < 1-1e-10, else 0
     dfloat sf = 1.0 - (zerof*_r[n])*(zerof*_r[n]);
     warp[n] = warp[n]/sf + warp[n]*(zerof-1);
   }
-
-  free(GLLr); free(req);
 }
 
-static void xytors(int Npoints, dfloat *x, dfloat *y, dfloat *r, dfloat *s) {
+static void xytors(const memory<dfloat> x, // input coordinates; length defines Npoints
+                   const memory<dfloat> y, // read at the same index n as x
+                   memory<dfloat> r, // written at indices 0..Npoints-1; not allocated here
+                   memory<dfloat> s) { // written at indices 0..Npoints-1; not allocated here
+  const int Npoints = x.length(); // replaces the former explicit Npoints argument
+
   for (int n=0;n<Npoints;n++) {
-    dfloat L1 = (sqrt(3.0)*y[n]+1.0)/3.0;
-    dfloat L2 = (-3.0*x[n] - sqrt(3.0)*y[n] + 2.0)/6.0;
-    dfloat L3 = ( 3.0*x[n] - sqrt(3.0)*y[n] + 2.0)/6.0;
+    const dfloat L1 = (sqrt(3.0)*y[n]+1.0)/3.0; // L1 + L2 + L3 equals 1 for any (x, y)
+    const dfloat L2 = (-3.0*x[n] - sqrt(3.0)*y[n] + 2.0)/6.0;
+    const dfloat L3 = ( 3.0*x[n] - sqrt(3.0)*y[n] + 2.0)/6.0; // differs from L2 only in the sign of the x term
 
     r[n] = -L2 + L3 - L1; s[n] = -L2 - L3 + L1;
   }
 }
 
-void mesh_t::WarpBlendTransformTri2D(int _N, int _Npoints, dfloat *_r, dfloat *_s, dfloat alphaIn){
+void mesh_t::WarpBlendTransformTri2D(const int _N,
+                                     memory<dfloat> _r, // read as input, then handed to xytors as an output at the end
+                                     memory<dfloat> _s, // same in/out role as _r
+                                     const dfloat alphaIn){ // -1 selects the built-in choice made in the branch below
 
   const dfloat alpopt[15] = {0.0000, 0.0000, 1.4152, 0.1001, 0.2751, 0.9800, 1.0999,
                              1.2832, 1.3648, 1.4773, 1.4959, 1.5743, 1.5770, 1.6223, 1.6258};
 
+  const int _Npoints = _r.length(); // replaces the former explicit _Npoints argument
+
   dfloat alpha;
   if (alphaIn==-1) {
     if (_N<16) {
@@ -740,16 +936,16 @@ void mesh_t::WarpBlendTransformTri2D(int _N, int _Npoints, dfloat *_r, dfloat *_
   }
 
   // Convert r s coordinates to points in equilateral triangle
-  dfloat *L1 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *L2 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *L3 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
+  memory<dfloat> L1(_Npoints); // per-point work arrays, filled in the loop below
+  memory<dfloat> L2(_Npoints);
+  memory<dfloat> L3(_Npoints);
 
-  dfloat *dL32 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *dL13 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *dL21 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
+  memory<dfloat> dL32(_Npoints); // inputs to the three Warpfactor calls further down
+  memory<dfloat> dL13(_Npoints);
+  memory<dfloat> dL21(_Npoints);
 
-  dfloat *_x = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *_y = (dfloat*) malloc(_Npoints*sizeof(dfloat));
+  memory<dfloat> _x(_Npoints); // equilateral-triangle coordinates, later passed to xytors
+  memory<dfloat> _y(_Npoints);
 
   for (int n=0;n<_Npoints;n++) {
     L1[n] =  0.5*(1.+_s[n]);
@@ -763,13 +959,13 @@ void mesh_t::WarpBlendTransformTri2D(int _N, int _Npoints, dfloat *_r, dfloat *_
     _x[n] = -L2[n]+L3[n]; _y[n] = (-L2[n]-L3[n]+2.*L1[n])/sqrt(3.0);
   }
 
-  dfloat *warpf1 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *warpf2 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
-  dfloat *warpf3 = (dfloat*) malloc(_Npoints*sizeof(dfloat));
+  memory<dfloat> warpf1(_Npoints); // sized here because Warpfactor takes its output by value
+  memory<dfloat> warpf2(_Npoints);
+  memory<dfloat> warpf3(_Npoints);
 
-  Warpfactor(_N, _Npoints, dL32, warpf1);
-  Warpfactor(_N, _Npoints, dL13, warpf2);
-  Warpfactor(_N, _Npoints, dL21, warpf3);
+  Warpfactor(_N, dL32, warpf1); // point count is now taken from the length of the input array
+  Warpfactor(_N, dL13, warpf2);
+  Warpfactor(_N, dL21, warpf3);
 
   for (int n=0;n<_Npoints;n++) {
     dfloat blend1 = 4.0*L2[n]*L3[n];
@@ -784,12 +980,7 @@ void mesh_t::WarpBlendTransformTri2D(int _N, int _Npoints, dfloat *_r, dfloat *_
     _y[n] += 0.*warp1 + sin(2.*M_PI/3.)*warp2 + sin(4.*M_PI/3.)*warp3;
   }
 
-  xytors(_Npoints, _x, _y, _r, _s);
-
-  free(L1); free(L2); free(L3);
-  free(dL32); free(dL21); free(dL13);
-  free(warpf1); free(warpf2); free(warpf3);
-  free(_x); free(_y);
+  xytors(_x, _y, _r, _s); // maps the warped (_x,_y) points to (r,s) and stores them through the _r/_s handles
 }
 
 // ------------------------------------------------------------------------
@@ -998,18 +1189,20 @@ static const dfloat cubTriR50[453] = {-4.872882732304178e-01,-2.542345353916386e
 static const dfloat cubTriS50[453] = {-4.872882732304183e-01,-4.872882732304183e-01,-2.542345353916348e-02,-9.981550877878594e-01,-1.133538262833160e-01, 1.115089140711756e-01,-9.981550877878594e-01, 1.115089140711755e-01,-1.133538262833150e-01,-9.551240174753737e-01,-9.481262122484194e-01, 9.032502297237955e-01,-9.551240174753737e-01, 9.032502297237955e-01,-9.481262122484182e-01,-5.544185434125337e-01,-5.197117894734606e-01, 7.413033288599558e-02,-5.544185434125337e-01, 7.413033288599546e-02,-5.197117894734606e-01,-9.537595720595068e-01,-7.308947639467150e-01, 6.846543360062232e-01,-9.537595720595068e-01, 6.846543360062232e-01,-7.308947639467139e-01,-9.982864404099823e-01,-9.099288550653375e-01, 9.082152954753198e-01,-9.982864404099823e-01, 9.082152954753198e-01,-9.099288550653363e-01,-9.903827246782556e-01,-1.825708668644565e-01, 1.729535915427133e-01,-9.903827246782556e-01, 1.729535915427121e-01,-1.825708668644553e-01,-9.530139752811234e-01,-6.570730845200339e-01, 6.100870598011572e-01,-9.530139752811234e-01, 6.100870598011572e-01,-6.570730845200325e-01,-9.967388437157084e-01,-9.967388437157074e-01, 9.934776874314213e-01,-8.132067057412727e-01,-5.797881505899334e-01, 3.929948563312072e-01,-8.132067057412727e-01, 3.929948563312072e-01,-5.797881505899334e-01,-9.164067266977042e-01,-7.214063597628300e-01, 6.378130864605343e-01,-9.164067266977042e-01, 6.378130864605343e-01,-7.214063597628289e-01,-9.012853762142768e-01,-8.592358921744816e-01, 7.605212683887597e-01,-9.012853762142771e-01, 7.605212683887597e-01,-8.592358921744805e-01,-9.613074769938050e-01,-1.934626150309704e-02,-1.934626150309704e-02,-6.943717859756775e-01,-2.547870236870117e-01,-5.084119033730944e-02,-6.943717859756775e-01,-5.084119033730944e-02,-2.547870236870114e-01,-8.697691401387569e-01,-7.041685654447123e-01, 5.739377055834691e-01,-8.697691401387569e-01, 5.739377055834691e-01,-7.041685654447112e-01,-9.530757136479578e-01,-5.719507128367128e-01, 5.250264264846709e-01,-9.530757136479578e-01, 
5.250264264846709e-01,-5.719507128367118e-01,-8.959354435311784e-01,-3.394651938837119e-01, 2.354006374148909e-01,-8.959354435311784e-01, 2.354006374148909e-01,-3.394651938837116e-01,-9.970109617889348e-01,-2.600302729111919e-01, 2.570412347001281e-01,-9.970109617889348e-01, 2.570412347001280e-01,-2.600302729111914e-01,-9.759125061304149e-01,-8.649768619133283e-01, 8.408893680437466e-01,-9.759125061304149e-01, 8.408893680437466e-01,-8.649768619133272e-01,-8.866304289130773e-01,-1.258023176398068e-01, 1.243274655288423e-02,-8.866304289130773e-01, 1.243274655288418e-02,-1.258023176398056e-01,-6.903486740300824e-01,-1.548256629849583e-01,-1.548256629849583e-01,-7.702797159103688e-01,-1.709755576844286e-01,-5.874472640520140e-02,-7.702797159103689e-01,-5.874472640520140e-02,-1.709755576844287e-01,-9.374703276265821e-01,-2.752389754621450e-01, 2.127093030887279e-01,-9.374703276265821e-01, 2.127093030887279e-01,-2.752389754621447e-01,-7.955067197089168e-01,-3.754974289775928e-01, 1.710041486865101e-01,-7.955067197089168e-01, 1.710041486865101e-01,-3.754974289775926e-01,-9.223027057361204e-01,-3.884864713193931e-02,-3.884864713193931e-02,-9.033013962538536e-01,-2.146213565926236e-01, 1.179227528464785e-01,-9.033013962538536e-01, 1.179227528464785e-01,-2.146213565926236e-01,-9.800408461407152e-01,-9.478722916754247e-01, 9.279131378161389e-01,-9.800408461407152e-01, 9.279131378161389e-01,-9.478722916754235e-01,-8.321741184601432e-01,-2.032180083100886e-01, 3.539212677023307e-02,-8.321741184601432e-01, 3.539212677023307e-02,-2.032180083100886e-01,-9.865437365974319e-01,-9.082392173581701e-01, 8.947829539556066e-01,-9.865437365974319e-01, 8.947829539556066e-01,-9.082392173581691e-01,-6.453839503352626e-01,-4.978033842696664e-01, 1.431873346049300e-01,-6.453839503352626e-01, 1.431873346049300e-01,-4.978033842696664e-01,-9.146318245298287e-01,-6.337261321307773e-01, 5.483579566606072e-01,-9.146318245298287e-01, 
5.483579566606072e-01,-6.337261321307773e-01,-8.512144639199243e-01,-2.919073934432752e-01, 1.431218573632000e-01,-8.512144639199243e-01, 1.431218573632000e-01,-2.919073934432750e-01,-5.128621044059271e-01,-4.015780257796654e-01,-8.555986981440711e-02,-5.128621044059271e-01,-8.555986981440711e-02,-4.015780257796653e-01,-8.472168605080239e-01,-4.183215340291764e-01, 2.655383945372011e-01,-8.472168605080239e-01, 2.655383945372011e-01,-4.183215340291763e-01,-7.515508206622273e-01,-6.277077362933863e-01, 3.792585569556139e-01,-7.515508206622273e-01, 3.792585569556139e-01,-6.277077362933852e-01,-9.804419832588688e-01,-7.327456500575513e-01, 7.131876333164204e-01,-9.804419832588688e-01, 7.131876333164204e-01,-7.327456500575501e-01,-9.441135959036719e-01,-8.620473297035763e-01, 8.061609256072495e-01,-9.441135959036719e-01, 8.061609256072495e-01,-8.620473297035763e-01,-9.503834629782960e-01,-4.770179614603841e-01, 4.274014244386812e-01,-9.503834629782960e-01, 4.274014244386800e-01,-4.770179614603829e-01,-9.577479991442321e-01,-9.120410878817669e-01, 8.697890870259968e-01,-9.577479991442321e-01, 8.697890870259968e-01,-9.120410878817656e-01,-8.637002594919088e-01,-5.060855365853788e-01, 3.697857960772886e-01,-8.637002594919088e-01, 3.697857960772886e-01,-5.060855365853777e-01,-8.624093861991303e-01,-6.167853940403172e-01, 4.791947802394489e-01,-8.624093861991303e-01, 4.791947802394489e-01,-6.167853940403172e-01,-6.872930457599575e-01,-3.652080353829373e-01, 5.250108114289476e-02,-6.872930457599575e-01, 5.250108114289476e-02,-3.652080353829372e-01,-9.684224577286503e-01,-2.123213874003148e-01, 1.807438451289652e-01,-9.684224577286503e-01, 1.807438451289652e-01,-2.123213874003138e-01,-9.531071840655393e-01,-3.637106635744283e-01, 3.168178476399691e-01,-9.531071840655393e-01, 3.168178476399681e-01,-3.637106635744282e-01,-7.281403730572283e-01,-4.392710248840062e-01, 1.674113979412348e-01,-7.281403730572283e-01, 
1.674113979412347e-01,-4.392710248840059e-01,-9.106088059739158e-01,-5.400485398316867e-01, 4.506573458056035e-01,-9.106088059739158e-01, 4.506573458056035e-01,-5.400485398316867e-01,-5.078874986216810e-01,-3.010237499257296e-01,-1.910887514525894e-01,-5.078874986216810e-01,-1.910887514525894e-01,-3.010237499257296e-01,-9.784730545245666e-01,-8.071832841875552e-01, 7.856563387121220e-01,-9.784730545245666e-01, 7.856563387121220e-01,-8.071832841875541e-01,-9.464044605480616e-01,-7.993239840668014e-01, 7.457284446148619e-01,-9.464044605480616e-01, 7.457284446148619e-01,-7.993239840668002e-01,-6.001966867209210e-01,-4.211243138320420e-01, 2.132100055296415e-02,-6.001966867209210e-01, 2.132100055296426e-02,-4.211243138320419e-01,-8.383470872976471e-01,-8.082645635117647e-02,-8.082645635117519e-02,-9.806804143047906e-01,-6.427670920898165e-01, 6.234475063946073e-01,-9.806804143047906e-01, 6.234475063946073e-01,-6.427670920898154e-01,-7.733335317298518e-01,-7.733335317298506e-01, 5.466670634597035e-01,-9.127674219487553e-01,-4.276921547374961e-01, 3.404595766862517e-01,-9.127674219487553e-01, 3.404595766862516e-01,-4.276921547374953e-01,-9.807324089004609e-01,-5.413422080782049e-01, 5.220746169786655e-01,-9.807324089004609e-01, 5.220746169786655e-01,-5.413422080782035e-01,-7.670859956964946e-01,-2.915742076670567e-01, 5.866020336355282e-02,-7.670859956964946e-01, 5.866020336355166e-02,-2.915742076670563e-01,-9.810462457017906e-01,-3.101110169100291e-01, 2.911572626118211e-01,-9.810462457017906e-01, 2.911572626118211e-01,-3.101110169100285e-01,-9.440667674080058e-01,-1.366535383528923e-01, 8.072030576089800e-02,-9.440667674080058e-01, 8.072030576089800e-02,-1.366535383528912e-01,-7.054683116064060e-01,-5.584055524963210e-01, 2.638738641027268e-01,-7.054683116064060e-01, 2.638738641027268e-01,-5.584055524963196e-01,-9.781219738706742e-01,-9.781219738706731e-01, 9.562439477413485e-01,-9.962588661779311e-01,-6.936543298046310e-01, 
6.899131959825635e-01,-9.962588661779311e-01, 6.899131959825635e-01,-6.936543298046310e-01,-8.427072760972010e-01,-8.427072760971998e-01, 6.854145521944008e-01,-9.958657196429472e-01,-9.816027664683789e-01, 9.774684861113283e-01,-9.958657196429472e-01, 9.774684861113283e-01,-9.816027664683776e-01,-9.959872383159527e-01,-9.529095314851794e-01, 9.488967698011345e-01,-9.959872383159527e-01, 9.488967698011345e-01,-9.529095314851782e-01,-6.068160154409350e-01,-3.085430405630674e-01,-8.464094399599648e-02,-6.068160154409350e-01,-8.464094399599648e-02,-3.085430405630671e-01,-7.850422677955388e-01,-4.996613823756856e-01, 2.847036501712243e-01,-7.850422677955388e-01, 2.847036501712243e-01,-4.996613823756844e-01,-9.962763812934690e-01,-4.889002747967343e-01, 4.851766560902033e-01,-9.962763812934691e-01, 4.851766560902033e-01,-4.889002747967332e-01,-8.995854148168808e-01,-7.934131294747505e-01, 6.929985442916313e-01,-8.995854148168808e-01, 6.929985442916313e-01,-7.934131294747493e-01,-6.594822352717378e-01,-6.594822352717367e-01, 3.189644705434744e-01,-8.418024664862509e-01,-7.730344416189197e-01, 6.148369081051707e-01,-8.418024664862509e-01, 6.148369081051707e-01,-7.730344416189197e-01,-9.963422606399693e-01,-5.959969617208548e-01, 5.923392223608240e-01,-9.963422606399693e-01, 5.923392223608240e-01,-5.959969617208537e-01,-6.007330813263658e-01,-6.007330813263658e-01, 2.014661626527315e-01,-8.050030786501207e-01,-6.891854133166682e-01, 4.941884919667888e-01,-8.050030786501207e-01, 4.941884919667888e-01,-6.891854133166671e-01,-9.794579145241266e-01,-4.325337483001823e-01, 4.119916628243097e-01,-9.794579145241266e-01, 4.119916628243097e-01,-4.325337483001819e-01,-6.026757301953206e-01,-1.986621349023401e-01,-1.986621349023392e-01,-7.187722518872315e-01,-7.187722518872315e-01, 4.375445037744630e-01,-9.961098664509530e-01,-7.797247785760779e-01, 7.758346450270308e-01,-9.961098664509530e-01, 
7.758346450270308e-01,-7.797247785760768e-01,-4.047614577816621e-01,-2.976192711091686e-01,-2.976192711091686e-01,-9.154992749697238e-01,-9.154992749697237e-01, 8.309985499394521e-01,-9.959745701414753e-01,-3.767460272074153e-01, 3.727205973488916e-01,-9.959745701414753e-01, 3.727205973488916e-01,-3.767460272074148e-01,-9.947355932482315e-01,-2.632203375884212e-03,-2.632203375883158e-03,-9.802403515707592e-01,-8.786874490008606e-02, 6.810909647084645e-02,-9.802403515707592e-01, 6.810909647084645e-02,-8.786874490008606e-02,-4.070402649740579e-01,-4.070402649740577e-01,-1.859194700518831e-01,-9.955476949059794e-01,-8.518818506402979e-01, 8.474295455462760e-01,-9.955476949059794e-01, 8.474295455462760e-01,-8.518818506402979e-01};
 static const dfloat cubTriW50[453] = { 6.657390349426455e-03, 6.657390349426455e-03, 6.657390349426455e-03, 6.946563189849545e-04, 6.946563189849545e-04, 6.946563189849545e-04, 6.946563189849545e-04, 6.946563189849545e-04, 6.946563189849545e-04, 6.001002038198160e-04, 6.001002038198160e-04, 6.001002038198160e-04, 6.001002038198160e-04, 6.001002038198160e-04, 6.001002038198160e-04, 5.460804118459578e-03, 5.460804118459578e-03, 5.460804118459578e-03, 5.460804118459578e-03, 5.460804118459578e-03, 5.460804118459578e-03, 2.265942903569247e-03, 2.265942903569247e-03, 2.265942903569247e-03, 2.265942903569247e-03, 2.265942903569247e-03, 2.265942903569247e-03, 2.730436819934103e-04, 2.730436819934103e-04, 2.730436819934103e-04, 2.730436819934103e-04, 2.730436819934103e-04, 2.730436819934103e-04, 1.538752133568642e-03, 1.538752133568642e-03, 1.538752133568642e-03, 1.538752133568642e-03, 1.538752133568642e-03, 1.538752133568642e-03, 2.593946389224276e-03, 2.593946389224276e-03, 2.593946389224276e-03, 2.593946389224276e-03, 2.593946389224276e-03, 2.593946389224276e-03, 7.318644849273957e-05, 7.318644849273957e-05, 7.318644849273957e-05, 5.611710324220868e-03, 5.611710324220868e-03, 5.611710324220868e-03, 5.611710324220868e-03, 5.611710324220868e-03, 5.611710324220868e-03, 3.417648445986814e-03, 3.417648445986814e-03, 3.417648445986814e-03, 3.417648445986814e-03, 3.417648445986814e-03, 3.417648445986814e-03, 2.967053886437357e-03, 2.967053886437357e-03, 2.967053886437357e-03, 2.967053886437357e-03, 2.967053886437357e-03, 2.967053886437357e-03, 3.302792714558273e-03, 3.302792714558273e-03, 3.302792714558273e-03, 8.508388866481208e-03, 8.508388866481208e-03, 8.508388866481208e-03, 8.508388866481208e-03, 8.508388866481208e-03, 8.508388866481208e-03, 4.395599173436755e-03, 4.395599173436755e-03, 4.395599173436755e-03, 4.395599173436755e-03, 4.395599173436755e-03, 4.395599173436755e-03, 3.097163895783280e-03, 3.097163895783280e-03, 3.097163895783280e-03, 3.097163895783280e-03, 
3.097163895783280e-03, 3.097163895783280e-03, 5.206561060307146e-03, 5.206561060307146e-03, 5.206561060307146e-03, 5.206561060307146e-03, 5.206561060307146e-03, 5.206561060307146e-03, 9.323405282065094e-04, 9.323405282065094e-04, 9.323405282065094e-04, 9.323405282065094e-04, 9.323405282065094e-04, 9.323405282065094e-04, 1.315154312747027e-03, 1.315154312747027e-03, 1.315154312747027e-03, 1.315154312747027e-03, 1.315154312747027e-03, 1.315154312747027e-03, 5.572930818998745e-03, 5.572930818998745e-03, 5.572930818998745e-03, 5.572930818998745e-03, 5.572930818998745e-03, 5.572930818998745e-03, 8.518193356382943e-03, 8.518193356382943e-03, 8.518193356382943e-03, 8.259313906283377e-03, 8.259313906283377e-03, 8.259313906283377e-03, 8.259313906283377e-03, 8.259313906283377e-03, 8.259313906283377e-03, 4.308565834654927e-03, 4.308565834654927e-03, 4.308565834654927e-03, 4.308565834654927e-03, 4.308565834654927e-03, 4.308565834654927e-03, 6.951904403342183e-03, 6.951904403342183e-03, 6.951904403342183e-03, 6.951904403342183e-03, 6.951904403342183e-03, 6.951904403342183e-03, 4.848559914819788e-03, 4.848559914819788e-03, 4.848559914819788e-03, 5.413228508794187e-03, 5.413228508794187e-03, 5.413228508794187e-03, 5.413228508794187e-03, 5.413228508794187e-03, 5.413228508794187e-03, 7.715585739642544e-04, 7.715585739642544e-04, 7.715585739642544e-04, 7.715585739642544e-04, 7.715585739642544e-04, 7.715585739642544e-04, 6.879332858451239e-03, 6.879332858451239e-03, 6.879332858451239e-03, 6.879332858451239e-03, 6.879332858451239e-03, 6.879332858451239e-03, 8.760538741807422e-04, 8.760538741807422e-04, 8.760538741807422e-04, 8.760538741807422e-04, 8.760538741807422e-04, 8.760538741807422e-04, 8.812029055620537e-03, 8.812029055620537e-03, 8.812029055620537e-03, 8.812029055620537e-03, 8.812029055620537e-03, 8.812029055620537e-03, 4.035001616910756e-03, 4.035001616910756e-03, 4.035001616910756e-03, 4.035001616910756e-03, 4.035001616910756e-03, 4.035001616910756e-03, 
6.476557209099314e-03, 6.476557209099314e-03, 6.476557209099314e-03, 6.476557209099314e-03, 6.476557209099314e-03, 6.476557209099314e-03, 9.332421598387951e-03, 9.332421598387951e-03, 9.332421598387951e-03, 9.332421598387951e-03, 9.332421598387951e-03, 9.332421598387951e-03, 6.318451635509753e-03, 6.318451635509753e-03, 6.318451635509753e-03, 6.318451635509753e-03, 6.318451635509753e-03, 6.318451635509753e-03, 6.947169036892493e-03, 6.947169036892493e-03, 6.947169036892493e-03, 6.947169036892493e-03, 6.947169036892493e-03, 6.947169036892493e-03, 1.790785760819592e-03, 1.790785760819592e-03, 1.790785760819592e-03, 1.790785760819592e-03, 1.790785760819592e-03, 1.790785760819592e-03, 2.160956967250754e-03, 2.160956967250754e-03, 2.160956967250754e-03, 2.160956967250754e-03, 2.160956967250754e-03, 2.160956967250754e-03, 3.625948898219889e-03, 3.625948898219889e-03, 3.625948898219889e-03, 3.625948898219889e-03, 3.625948898219889e-03, 3.625948898219889e-03, 1.530314260972668e-03, 1.530314260972668e-03, 1.530314260972668e-03, 1.530314260972668e-03, 1.530314260972668e-03, 1.530314260972668e-03, 5.715363408558084e-03, 5.715363408558084e-03, 5.715363408558084e-03, 5.715363408558084e-03, 5.715363408558084e-03, 5.715363408558084e-03, 5.311624181103388e-03, 5.311624181103388e-03, 5.311624181103388e-03, 5.311624181103388e-03, 5.311624181103388e-03, 5.311624181103388e-03, 8.954875242212912e-03, 8.954875242212912e-03, 8.954875242212912e-03, 8.954875242212912e-03, 8.954875242212912e-03, 8.954875242212912e-03, 3.307034522546869e-03, 3.307034522546869e-03, 3.307034522546869e-03, 3.307034522546869e-03, 3.307034522546869e-03, 3.307034522546869e-03, 3.889979271429086e-03, 3.889979271429086e-03, 3.889979271429086e-03, 3.889979271429086e-03, 3.889979271429086e-03, 3.889979271429086e-03, 8.109035161435074e-03, 8.109035161435074e-03, 8.109035161435074e-03, 8.109035161435074e-03, 8.109035161435074e-03, 8.109035161435074e-03, 4.694394019735870e-03, 4.694394019735870e-03, 
4.694394019735870e-03, 4.694394019735870e-03, 4.694394019735870e-03, 4.694394019735870e-03, 1.083959336378307e-02, 1.083959336378307e-02, 1.083959336378307e-02, 1.083959336378307e-02, 1.083959336378307e-02, 1.083959336378307e-02, 1.642838231112343e-03, 1.642838231112343e-03, 1.642838231112343e-03, 1.642838231112343e-03, 1.642838231112343e-03, 1.642838231112343e-03, 2.666246149563934e-03, 2.666246149563934e-03, 2.666246149563934e-03, 2.666246149563934e-03, 2.666246149563934e-03, 2.666246149563934e-03, 9.345204607681365e-03, 9.345204607681365e-03, 9.345204607681365e-03, 9.345204607681365e-03, 9.345204607681365e-03, 9.345204607681365e-03, 7.165422422124212e-03, 7.165422422124212e-03, 7.165422422124212e-03, 2.094698488702054e-03, 2.094698488702054e-03, 2.094698488702054e-03, 2.094698488702054e-03, 2.094698488702054e-03, 2.094698488702054e-03, 5.559197600889325e-03, 5.559197600889325e-03, 5.559197600889325e-03, 5.114025641788271e-03, 5.114025641788271e-03, 5.114025641788271e-03, 5.114025641788271e-03, 5.114025641788271e-03, 5.114025641788271e-03, 2.307531856262896e-03, 2.307531856262896e-03, 2.307531856262896e-03, 2.307531856262896e-03, 2.307531856262896e-03, 2.307531856262896e-03, 8.447515166178116e-03, 8.447515166178116e-03, 8.447515166178116e-03, 8.447515166178116e-03, 8.447515166178116e-03, 8.447515166178116e-03, 2.674835029006195e-03, 2.674835029006195e-03, 2.674835029006195e-03, 2.674835029006195e-03, 2.674835029006195e-03, 2.674835029006195e-03, 4.762846609148414e-03, 4.762846609148414e-03, 4.762846609148414e-03, 4.762846609148414e-03, 4.762846609148414e-03, 4.762846609148414e-03, 8.565704904038088e-03, 8.565704904038088e-03, 8.565704904038088e-03, 8.565704904038088e-03, 8.565704904038088e-03, 8.565704904038088e-03, 5.999043738296138e-04, 5.999043738296138e-04, 5.999043738296138e-04, 8.813879170752840e-04, 8.813879170752840e-04, 8.813879170752840e-04, 8.813879170752840e-04, 8.813879170752840e-04, 8.813879170752840e-04, 4.135743334159545e-03, 
4.135743334159545e-03, 4.135743334159545e-03, 2.312905155015999e-04, 2.312905155015999e-04, 2.312905155015999e-04, 2.312905155015999e-04, 2.312905155015999e-04, 2.312905155015999e-04, 3.648502036079644e-04, 3.648502036079644e-04, 3.648502036079644e-04, 3.648502036079644e-04, 3.648502036079644e-04, 3.648502036079644e-04, 1.027023169453754e-02, 1.027023169453754e-02, 1.027023169453754e-02, 1.027023169453754e-02, 1.027023169453754e-02, 1.027023169453754e-02, 7.810111420309305e-03, 7.810111420309305e-03, 7.810111420309305e-03, 7.810111420309305e-03, 7.810111420309305e-03, 7.810111420309305e-03, 1.056435119667814e-03, 1.056435119667814e-03, 1.056435119667814e-03, 1.056435119667814e-03, 1.056435119667814e-03, 1.056435119667814e-03, 3.798126951694121e-03, 3.798126951694121e-03, 3.798126951694121e-03, 3.798126951694121e-03, 3.798126951694121e-03, 3.798126951694121e-03, 8.055421977036622e-03, 8.055421977036622e-03, 8.055421977036622e-03, 4.668118624327395e-03, 4.668118624327395e-03, 4.668118624327395e-03, 4.668118624327395e-03, 4.668118624327395e-03, 4.668118624327395e-03, 9.653413559593795e-04, 9.653413559593795e-04, 9.653413559593795e-04, 9.653413559593795e-04, 9.653413559593795e-04, 9.653413559593795e-04, 9.141347778252359e-03, 9.141347778252359e-03, 9.141347778252359e-03, 6.498038898642801e-03, 6.498038898642801e-03, 6.498038898642801e-03, 6.498038898642801e-03, 6.498038898642801e-03, 6.498038898642801e-03, 2.639533833535306e-03, 2.639533833535306e-03, 2.639533833535306e-03, 2.639533833535306e-03, 2.639533833535306e-03, 2.639533833535306e-03, 1.033071185490775e-02, 1.033071185490775e-02, 1.033071185490775e-02, 6.970526706905318e-03, 6.970526706905318e-03, 6.970526706905318e-03, 7.934585220888267e-04, 7.934585220888267e-04, 7.934585220888267e-04, 7.934585220888267e-04, 7.934585220888267e-04, 7.934585220888267e-04, 1.171362890783687e-02, 1.171362890783687e-02, 1.171362890783687e-02, 2.397897567676900e-03, 2.397897567676900e-03, 2.397897567676900e-03, 
1.171186882042265e-03, 1.171186882042265e-03, 1.171186882042265e-03, 1.171186882042265e-03, 1.171186882042265e-03, 1.171186882042265e-03, 1.498644108210421e-03, 1.498644108210421e-03, 1.498644108210421e-03, 2.801549769502369e-03, 2.801549769502369e-03, 2.801549769502369e-03, 2.801549769502369e-03, 2.801549769502369e-03, 2.801549769502369e-03, 1.154727780718457e-02, 1.154727780718457e-02, 1.154727780718457e-02, 7.410110976283445e-04, 7.410110976283445e-04, 7.410110976283445e-04, 7.410110976283445e-04, 7.410110976283445e-04, 7.410110976283445e-04};
 
-void mesh_t::CubatureNodesTri2D(int cubTriN, int *_cubNp, dfloat **cubTrir, dfloat **cubTris, dfloat **cubTriw){
+void mesh_t::CubatureNodesTri2D(const int cubTriN,
+                                int& _cubNp,
+                                memory<dfloat>& cubTrir,
+                                memory<dfloat>& cubTris,
+                                memory<dfloat>& cubTriw){
 
-  if (cubTriN>50)
-    LIBP_ABORT(string("Requested Cubature order unavailable."))
+  LIBP_ABORT("Requested Cubature order unavailable.",
+             cubTriN>50);
 
-  int cubTriNp = cubTriNps[cubTriN-1];
+  _cubNp = cubTriNps[cubTriN-1];
 
-  *_cubNp = cubTriNp;
-
-  *cubTrir = (dfloat*) calloc(cubTriNp, sizeof(dfloat));
-  *cubTris = (dfloat*) calloc(cubTriNp, sizeof(dfloat));
-  *cubTriw = (dfloat*) calloc(cubTriNp, sizeof(dfloat));
+  cubTrir.malloc(_cubNp);
+  cubTris.malloc(_cubNp);
+  cubTriw.malloc(_cubNp);
 
   const dfloat *cubTriR, *cubTriS, *cubTriW;
   switch(cubTriN){
@@ -1064,12 +1257,14 @@ void mesh_t::CubatureNodesTri2D(int cubTriN, int *_cubNp, dfloat **cubTrir, dflo
     case 49: cubTriR = cubTriR49; cubTriS = cubTriS49; cubTriW = cubTriW49; break;
     case 50: cubTriR = cubTriR50; cubTriS = cubTriS50; cubTriW = cubTriW50; break;
     default:
-      LIBP_ABORT(string("Requested Cubature order unavailable."))
+      LIBP_FORCE_ABORT("Requested Cubature order unavailable.");
   }
 
-  for(int n=0;n<cubTriNp;++n){
-    cubTrir[0][n] = cubTriR[n];
-    cubTris[0][n] = cubTriS[n];
-    cubTriw[0][n] = cubTriW[n];
+  for(int n=0;n<_cubNp;++n){
+    cubTrir[n] = cubTriR[n];
+    cubTris[n] = cubTriS[n];
+    cubTriw[n] = cubTriW[n];
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshConnect.cpp b/libs/mesh/meshConnect.cpp
index 7d6d65ed3..f2f0b4155 100644
--- a/libs/mesh/meshConnect.cpp
+++ b/libs/mesh/meshConnect.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,100 +26,269 @@ SOFTWARE.
 
 #include "mesh.hpp"
 
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::sort;
+#else
+using std::sort;
+#endif
+
+namespace libp {
+
 // structure used to encode vertices that make
 // each face, the element/face indices, and
 // the neighbor element/face indices (if any)
-typedef struct{
-
-  dlong element;
-  int face;
-
-  dlong elementNeighbor; // neighbor element
-  int faceNeighbor;    // neighbor face
-
-  hlong v[4];
+typedef struct {
+  hlong v[4]; // vertex ids of the face (taken from EToV and kept sorted; only the first NfaceVertices entries are used)
+  dlong element, elementN; // element owning the face, and the neighbor's element (-1 until a match is found)
+  int face, faceN;    // face number within the owning element, and within the neighbor (-1 until a match is found)
+  int rank, rankN; // rank owning the face, and the neighbor's rank; the N suffix marks neighbor info
 
 }face_t;
 
 
-/* routine to find EToE (Element To Element)
-   and EToF (Element To Local Face) connectivity arrays */
+// Build face connectivity for the local partition: fills EToE, EToF, EToP and NelementsGlobal
 void mesh_t::Connect(){
 
+  EToE.malloc(Nelements*Nfaces); // neighbor element per (element,face); -1 if unmatched
+  EToF.malloc(Nelements*Nfaces); // neighbor's face number per (element,face); -1 if unmatched
+  EToP.malloc(Nelements*Nfaces); // neighbor's rank for faces matched across ranks; -1 otherwise
+
+  /**********************
+   * Local Connectivity
+   **********************/
+
   /* build list of faces */
-  face_t *faces =
-    (face_t*) calloc(Nelements*Nfaces, sizeof(face_t));
+  memory<face_t> faces(Nelements*Nfaces);
 
-  dlong cnt = 0;
+  #pragma omp parallel for collapse(2)
   for(dlong e=0;e<Nelements;++e){
     for(int f=0;f<Nfaces;++f){
 
+      const dlong id = f + e*Nfaces; // flat (element,face) slot, so each iteration writes its own entry
+
       for(int n=0;n<NfaceVertices;++n){
         dlong vid = e*Nverts + faceVertices[f*NfaceVertices+n];
-        faces[cnt].v[n] = EToV[vid];
+        faces[id].v[n] = EToV[vid];
       }
 
-      std::sort(faces[cnt].v, faces[cnt].v+NfaceVertices,
+      std::sort(faces[id].v, faces[id].v+NfaceVertices,
                 std::less<hlong>());
 
-      faces[cnt].element = e;
-      faces[cnt].face = f;
-
-      faces[cnt].elementNeighbor= -1;
-      faces[cnt].faceNeighbor = -1;
+      faces[id].element = e;
+      faces[id].face = f;
 
-      ++cnt;
+      faces[id].elementN= -1; // -1 = no neighbor found (yet)
+      faces[id].faceN = -1;
     }
   }
 
   /* sort faces by their vertex number pairs */
-  std::sort(faces, faces+Nelements*Nfaces,
-            [&](const face_t& a, const face_t& b) {
-              return std::lexicographical_compare(a.v, a.v+NfaceVertices,
-                                                  b.v, b.v+NfaceVertices);
-            });
+  sort(faces.ptr(), faces.ptr()+Nelements*Nfaces,
+       [&](const face_t& a, const face_t& b) {
+         return std::lexicographical_compare(a.v, a.v+NfaceVertices,
+                                             b.v, b.v+NfaceVertices);
+       });
 
   /* scan through sorted face lists looking for adjacent
      faces that have the same vertex ids */
-  for(cnt=0;cnt<Nelements*Nfaces-1;++cnt){
+  #pragma omp parallel for
+  for(dlong cnt=0;cnt<Nelements*Nfaces-1;++cnt){ // NOTE(review): writes entries cnt and cnt+1; presumably safe because at most two faces share a vertex set - confirm
     if(std::equal(faces[cnt].v, faces[cnt].v+NfaceVertices,
                   faces[cnt+1].v)){
       // match
-      faces[cnt].elementNeighbor = faces[cnt+1].element;
-      faces[cnt].faceNeighbor = faces[cnt+1].face;
+      faces[cnt].elementN = faces[cnt+1].element;
+      faces[cnt].faceN = faces[cnt+1].face;
 
-      faces[cnt+1].elementNeighbor = faces[cnt].element;
-      faces[cnt+1].faceNeighbor = faces[cnt].face;
+      faces[cnt+1].elementN = faces[cnt].element;
+      faces[cnt+1].faceN = faces[cnt].face;
     }
   }
 
   /* resort faces back to the original element/face ordering */
-  std::sort(faces, faces+Nelements*Nfaces,
-            [](const face_t& a, const face_t& b) {
-              if(a.element < b.element) return true;
-              if(a.element > b.element) return false;
+  sort(faces.ptr(), faces.ptr()+Nelements*Nfaces,
+       [](const face_t& a, const face_t& b) {
+         if(a.element < b.element) return true;
+         if(a.element > b.element) return false;
 
-              return (a.face < b.face);
-            });
+         return (a.face < b.face);
+       });
 
   /* extract the element to element and element to face connectivity */
-  EToE = (dlong*) calloc(Nelements*Nfaces, sizeof(dlong));
-  EToF = (int*)   calloc(Nelements*Nfaces, sizeof(int  ));
+  #pragma omp parallel for collapse(2)
+  for(dlong e=0;e<Nelements;++e){
+    for(int f=0;f<Nfaces;++f){
+      const dlong id = f + e*Nfaces; // after the re-sort, slot id holds face f of element e again
+
+      EToE[id] = faces[id].elementN;
+      EToF[id] = faces[id].faceN;
+    }
+  }
+  faces.free(); // the local face list is not used past this point
 
-  cnt = 0;
+
+  /*****************************
+   * Interprocess Connectivity
+   *****************************/
+
+  // count # of locally unmatched faces to send to each rank, where the
+  // destination rank is max {face vertex ids} % size
+  memory<int> Nsend(size, 0);
+  memory<int> Nrecv(size, 0);
+  memory<int> sendOffsets(size, 0);
+  memory<int> recvOffsets(size, 0);
+
+  // WARNING: In some corner cases, the number of faces to send may overrun int storage
+  int allNsend = 0;
   for(dlong e=0;e<Nelements;++e){
     for(int f=0;f<Nfaces;++f){
-      EToE[cnt] = faces[cnt].elementNeighbor;
-      EToF[cnt] = faces[cnt].faceNeighbor;
-      //      printf("EToE(%d,%d) = %d \n", e,f, EToE[cnt]);
-      ++cnt;
+      if(EToE[e*Nfaces+f]==-1){
+        // destination rank is max(face vertex ids)%size, so every rank holding this face picks the same destination
+        hlong maxv = 0;
+        for(int n=0;n<NfaceVertices;++n){
+          int nid = faceVertices[f*NfaceVertices+n];
+          hlong id = EToV[e*Nverts + nid];
+          maxv = std::max(maxv, id);
+        }
+        int destRank = (int) (maxv%size);
+
+        // increment the send count for the destination rank, and the total
+        ++Nsend[destRank];
+        ++allNsend;
+      }
     }
   }
 
-  // dlong Nbcs = 0;
-  // for(dlong e=0;e<Nelements;++e)
-  //   for(int f=0;f<Nfaces;++f)
-  //     if(EToE[e*Nfaces+f]==-1)
-  //       ++Nbcs;
-  //printf("Nelements = %d, Nbcs = %d\n", Nelements, Nbcs);
+  // find send offsets (exclusive prefix sum of the per-rank send counts)
+  for(int rr=1;rr<size;++rr)
+    sendOffsets[rr] = sendOffsets[rr-1] + Nsend[rr-1];
+
+  // reset counters; Nsend is reused below as a per-rank fill cursor and ends up back at the same counts
+  for(int rr=0;rr<size;++rr)
+    Nsend[rr] = 0;
+
+  // buffer for outgoing face data, grouped by destination rank
+  memory<face_t> sendFaces(allNsend);
+
+  // pack face data (same traversal and destination rule as the counting pass above)
+  for(dlong e=0;e<Nelements;++e){
+    for(int f=0;f<Nfaces;++f){
+      if(EToE[e*Nfaces+f]==-1){
+
+        // find rank of destination for sorting based on max(face vertices)%size
+        hlong maxv = 0;
+        for(int n=0;n<NfaceVertices;++n){
+          int nid = faceVertices[f*NfaceVertices+n];
+          hlong id = EToV[e*Nverts + nid];
+          maxv = std::max(maxv, id);
+        }
+        int destRank = (int) (maxv%size);
+
+        // next free slot in the segment of sendFaces reserved for destRank
+        int id = sendOffsets[destRank]+Nsend[destRank];
+
+        sendFaces[id].element = e;
+        sendFaces[id].face = f;
+        for(int n=0;n<NfaceVertices;++n){
+          int nid = faceVertices[f*NfaceVertices+n];
+          sendFaces[id].v[n] = EToV[e*Nverts + nid];
+        }
+
+        std::sort(sendFaces[id].v, sendFaces[id].v+NfaceVertices,
+                  std::less<hlong>());
+
+        sendFaces[id].rank = rank; // remember the origin so the match can be routed back
+
+        sendFaces[id].elementN = -1;
+        sendFaces[id].faceN = -1;
+        sendFaces[id].rankN = -1;
+
+        ++Nsend[destRank];
+      }
+    }
+  }
+
+  // exchange per-rank face counts (Nsend/Nrecv count face_t records, not bytes)
+  comm.Alltoall(Nsend, Nrecv);
+
+  // count incoming faces
+  int allNrecv = 0;
+  for(int rr=0;rr<size;++rr)
+    allNrecv += Nrecv[rr];
+
+  // find offsets for recv data
+  for(int rr=1;rr<size;++rr)
+    recvOffsets[rr] = recvOffsets[rr-1] + Nrecv[rr-1]; // offsets counted in faces, like Nrecv
+
+  // buffer for incoming face data
+  memory<face_t> recvFaces(allNrecv);
+
+  // exchange parallel faces
+  comm.Alltoallv(sendFaces, Nsend, sendOffsets,
+                 recvFaces, Nrecv, recvOffsets);
+
+  // local sort of the allNrecv received faces by their sorted vertex ids
+  sort(recvFaces.ptr(), recvFaces.ptr()+allNrecv,
+      [&](const face_t& a, const face_t& b) {
+        return std::lexicographical_compare(a.v, a.v+NfaceVertices,
+                                            b.v, b.v+NfaceVertices);
+      });
+
+  // find matches
+  #pragma omp parallel for
+  for(int n=0;n<allNrecv-1;++n){
+    // since the faces are sorted by vertex ids, matching faces sit next to each other
+    if(std::equal(recvFaces[n].v, recvFaces[n].v+NfaceVertices,
+                  recvFaces[n+1].v)){
+      recvFaces[n].elementN = recvFaces[n+1].element;
+      recvFaces[n].faceN = recvFaces[n+1].face;
+      recvFaces[n].rankN = recvFaces[n+1].rank;
+
+      recvFaces[n+1].elementN = recvFaces[n].element;
+      recvFaces[n+1].faceN = recvFaces[n].face;
+      recvFaces[n+1].rankN = recvFaces[n].rank;
+    }
+  }
+
+  // sort back to (rank, element, face) order, i.e. the order in which each sender packed its faces
+  sort(recvFaces.ptr(), recvFaces.ptr()+allNrecv,
+      [](const face_t& a, const face_t& b) {
+        if(a.rank < b.rank) return true;
+        if(a.rank > b.rank) return false;
+
+        if(a.element < b.element) return true;
+        if(a.element > b.element) return false;
+
+        return (a.face < b.face);
+      });
+
+  // send faces back from whence they came (send/recv counts and offsets swap roles)
+  comm.Alltoallv(recvFaces, Nrecv, recvOffsets,
+                 sendFaces, Nsend, sendOffsets);
+
+  // extract connectivity info; EToP defaults to -1 for every face
+  #pragma omp parallel for
+  for(dlong n=0;n<Nelements*Nfaces;++n)
+    EToP[n] = -1;
+
+  #pragma omp parallel for
+  for(int n=0;n<allNsend;++n){
+    dlong e = sendFaces[n].element;
+    dlong eN = sendFaces[n].elementN;
+    int f = sendFaces[n].face;
+    int fN = sendFaces[n].faceN;
+    int rN = sendFaces[n].rankN;
+
+    if(e>=0 && f>=0 && eN>=0 && fN>=0){ // only faces that found a partner; the rest keep EToE/EToF/EToP = -1
+      EToE[e*Nfaces+f] = eN;
+      EToF[e*Nfaces+f] = fN;
+      EToP[e*Nfaces+f] = rN;
+    }
+  }
+
+  //record the number of elements in the whole mesh
+  NelementsGlobal = Nelements;
+  comm.Allreduce(NelementsGlobal);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshConnectBoundary.cpp b/libs/mesh/meshConnectBoundary.cpp
index 97207dad7..29bd5c231 100644
--- a/libs/mesh/meshConnectBoundary.cpp
+++ b/libs/mesh/meshConnectBoundary.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,15 @@ SOFTWARE.
 
 #include "mesh.hpp"
 
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::sort;
+#else
+using std::sort;
+#endif
+
+namespace libp {
+
 // structure used to encode vertices that make
 // each face, the element/face indices, and
 // the neighbor element/face indices (if any)
@@ -59,8 +68,7 @@ void mesh_t::ConnectBoundary(){
 #endif
 
   /* build list of boundary faces */
-  boundaryFace_t *boundaryFaces = (boundaryFace_t*) calloc(bcnt+NboundaryFaces,
-                                                           sizeof(boundaryFace_t));
+  memory<boundaryFace_t> boundaryFaces(bcnt+NboundaryFaces);
 
   bcnt = 0; // reset counter
   for(dlong e=0;e<Nelements;++e){
@@ -113,27 +121,29 @@ void mesh_t::ConnectBoundary(){
 #endif
 
   /* sort boundaryFaces by their vertex number pairs */
-  std::sort(boundaryFaces, boundaryFaces+bcnt,
-            [&](const boundaryFace_t& a, const boundaryFace_t& b) {
-              return std::lexicographical_compare(a.v, a.v+NfaceVertices,
-                                                  b.v, b.v+NfaceVertices);
-            });
+  sort(boundaryFaces.ptr(), boundaryFaces.ptr()+bcnt,
+      [&](const boundaryFace_t& a, const boundaryFace_t& b) {
+        return std::lexicographical_compare(a.v, a.v+NfaceVertices,
+                                            b.v, b.v+NfaceVertices);
+      });
 
   /* scan through sorted face lists looking for element-boundary matches */
-  EToB = (int*) calloc(Nelements*Nfaces, sizeof(int));
-  for(dlong n=0;n<Nelements*Nfaces;++n) EToB[n] = -1;
+  EToB.malloc(Nelements*Nfaces, -1);
 
+  #pragma omp parallel for
   for(hlong cnt=0;cnt<bcnt-1;++cnt){
     if(std::equal(boundaryFaces[cnt].v, boundaryFaces[cnt].v+NfaceVertices,
                   boundaryFaces[cnt+1].v)){
-      dlong e = mymax(boundaryFaces[cnt].element, boundaryFaces[cnt+1].element);
-      int f   = mymax(boundaryFaces[cnt].face,    boundaryFaces[cnt+1].face);
+      dlong e = std::max(boundaryFaces[cnt].element, boundaryFaces[cnt+1].element);
+      int f   = std::max(boundaryFaces[cnt].face,    boundaryFaces[cnt+1].face);
 
       EToB[e*Nfaces+f] =
-        mymax(boundaryFaces[cnt].bctype, boundaryFaces[cnt+1].bctype);
+        std::max(boundaryFaces[cnt].bctype, boundaryFaces[cnt+1].bctype);
     }
   }
 
+  o_EToB = platform.malloc<int>(EToB);
+
 #if 0
   int cnt = 0;
   for(int e=0;e<Nelements;++e){
@@ -143,7 +153,6 @@ void mesh_t::ConnectBoundary(){
     }
   }
 #endif
-
-  free(boundaryFaces);
 }
 
+} //namespace libp
diff --git a/libs/mesh/meshConnectFaceNodes.cpp b/libs/mesh/meshConnectFaceNodes.cpp
new file mode 100644
index 000000000..b20c6a058
--- /dev/null
+++ b/libs/mesh/meshConnectFaceNodes.cpp
@@ -0,0 +1,103 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "mesh.hpp"
+
+namespace libp {
+
+// Serial face-node to face-node connection: fills vmapM, vmapP and mapP and builds their o_* buffers.
+void mesh_t::ConnectFaceNodes(){
+
+  /* Build the permutation array R (handed to the element-specific matching routine, then read in the face loop) */
+  memory<int> R;
+
+  switch (elementType) { // NOTE(review): no default case; for any other element type R is never passed to a routine
+    case Mesh::TRIANGLES:
+      FaceNodeMatchingTri2D(r, s, faceNodes, faceVertices, R);
+      break;
+    case Mesh::QUADRILATERALS:
+      FaceNodeMatchingQuad2D(r, s, faceNodes, faceVertices, R);
+      break;
+    case Mesh::TETRAHEDRA:
+      FaceNodeMatchingTet3D(r, s, t, faceNodes, faceVertices, R);
+      break;
+    case Mesh::HEXAHEDRA:
+      FaceNodeMatchingHex3D(r, s, t, faceNodes, faceVertices, R);
+      break;
+  }
+
+  /* volume indices of the interior (vmapM) and exterior (vmapP) face nodes for each element */
+  vmapM.malloc(Nfp*Nfaces*Nelements);
+  vmapP.malloc(Nfp*Nfaces*Nelements);
+  mapP.malloc(Nfp*Nfaces*Nelements); // holds a face-node index (element, face, face-node), not a volume index
+
+  /* assume elements already connected: EToE and EToF are only read here */
+  #pragma omp parallel for collapse(2)
+  for(dlong eM=0;eM<Nelements;++eM){
+    for(int fM=0;fM<Nfaces;++fM){
+      dlong eP = EToE[eM*Nfaces+fM]; // element on the other side of face fM
+      int fP = EToF[eM*Nfaces+fM]; // face index on that element
+      if(eP<0 || fP<0){ // fake connections for unconnected faces: every node is paired with itself
+        for(int nM=0;nM<Nfp;++nM){
+          const int idM = faceNodes[fM*Nfp+nM];
+          const dlong id = eM*Nfaces*Nfp + fM*Nfp + nM;
+          vmapM[id] = idM + eM*Np;
+          vmapP[id] = idM + eM*Np;
+          mapP[id]  = id;
+        }
+      } else {
+        //Find the rotation: rot is the position on face fM of the vertex whose EToV id equals the first vertex of face fP
+        hlong vf0P = EToV[eP*Nverts + faceVertices[fP*NfaceVertices+0]];
+        int rot=0;
+        for (;rot<NfaceVertices;++rot) {
+          hlong vfM = EToV[eM*Nverts + faceVertices[fM*NfaceVertices+rot]];
+          if (vfM == vf0P) break;
+        }
+        // NOTE(review): rot == NfaceVertices here if no vertex matched; presumably impossible for connected faces -- confirm
+        /* for each node on this face use the permutation array (indexed by fM, fP, rot, nM)
+           to select the neighbor node */
+        for(int nM=0;nM<Nfp;++nM){
+          const int nP  = R[fM*Nfaces*Nfp*NfaceVertices
+                            + fP*Nfp*NfaceVertices
+                            + rot*Nfp + nM];
+          const int idM = faceNodes[fM*Nfp+nM];
+          const int idP = faceNodes[fP*Nfp+nP];
+
+          const dlong id = eM*Nfaces*Nfp + fM*Nfp + nM;
+          vmapM[id] = idM + eM*Np;
+          vmapP[id] = idP + eP*Np;
+          mapP[id]  = eP*Nfaces*Nfp + fP*Nfp + nP;
+        }
+      }
+    }
+  }
+  // build the o_* buffers from the host arrays via platform.malloc
+  o_vmapM = platform.malloc<dlong>(vmapM);
+  o_vmapP = platform.malloc<dlong>(vmapP);
+  o_mapP  = platform.malloc<dlong>(mapP);
+}
+
+} //namespace libp
diff --git a/libs/mesh/meshConnectFaceNodes2D.cpp b/libs/mesh/meshConnectFaceNodes2D.cpp
deleted file mode 100644
index 97f84b9f5..000000000
--- a/libs/mesh/meshConnectFaceNodes2D.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-
-static int findBestMatch(dfloat x1, dfloat y1,
-                   int Np2, int *nodeList, dfloat *x2, dfloat *y2, int *nP){
-
-  int matchIndex = nodeList[0];
-  dfloat mindist2 = pow(x1-x2[nodeList[0]],2) + pow(y1-y2[nodeList[0]],2);
-
-  *nP = 0;
-  for(int n=1;n<Np2;++n){
-
-    /* next node */
-    const int i2 = nodeList[n];
-
-    /* distance between target and next node */
-    const dfloat dist2 = pow(x1-x2[i2],2) + pow(y1-y2[i2],2);
-
-    /* if next node is closer to target update match */
-    if(dist2<mindist2){
-      mindist2 = dist2;
-      matchIndex = i2;
-      *nP = n;
-    }
-  }
-  if(mindist2>1e-3) {
-    stringstream ss;
-    ss << "Bad match: x,y = " << x1 << ", " << y1 << "\n";
-    LIBP_ABORT(ss.str())
-  }
-  return matchIndex;
-}
-
-// serial face-node to face-node connection
-void mesh2D::ConnectFaceNodes(){
-
-  /* volume indices of the interior and exterior face nodes for each element */
-  vmapM = (dlong*) calloc(Nfp*Nfaces*Nelements, sizeof(dlong));
-  vmapP = (dlong*) calloc(Nfp*Nfaces*Nelements, sizeof(dlong));
-  mapP  = (dlong*) calloc(Nfp*Nfaces*Nelements, sizeof(dlong));
-
-  //check if we're connecting a periodic box mesh
-  int periodicFlag = 0;
-  if (settings.compareSetting("MESH FILE","BOX") &&
-      settings.compareSetting("BOX BOUNDARY FLAG","-1"))
-    periodicFlag = 1;
-
-  //box dimensions
-  dfloat DIMX, DIMY;
-  settings.getSetting("BOX DIMX", DIMX);
-  settings.getSetting("BOX DIMY", DIMY);
-
-  //box is centered at the origin
-  DIMX /= 2.0;
-  DIMY /= 2.0;
-
-  /* assume elements already connected */
-  for(dlong e=0;e<Nelements;++e){
-    for(int f=0;f<Nfaces;++f){
-      dlong eP = EToE[e*Nfaces+f];
-      int fP = EToF[e*Nfaces+f];
-      if(eP<0 || fP<0){ // fake connections for unconnected faces
-        eP = e;
-        fP = f;
-      }
-
-      dfloat offsetX = 0.0;
-      dfloat offsetY = 0.0;
-
-      if (periodicFlag) {
-        //if the mesh is periodic, this is more complicated.
-        // check if this face is on a boundary face
-        bool top=true, bottom=true, left=true, right=true;
-        for(int n=0;n<NfaceVertices;++n){
-          dlong vid = e*Nverts + faceVertices[f*NfaceVertices+n];
-          if (fabs(EX[vid]-DIMX)>1e-4) right = false;
-          if (fabs(EX[vid]+DIMX)>1e-4) left = false;
-          if (fabs(EY[vid]-DIMY)>1e-4) top = false;
-          if (fabs(EY[vid]+DIMY)>1e-4) bottom = false;
-        }
-
-        if (right)  offsetX = -2.0*DIMX;
-        if (left)   offsetX =  2.0*DIMX;
-        if (top)    offsetY = -2.0*DIMY;
-        if (bottom) offsetY =  2.0*DIMY;
-      }
-
-      /* for each node on this face find the neighbor node */
-      for(int n=0;n<Nfp;++n){
-        dlong idM = faceNodes[f*Nfp+n] + e*Np;
-        dfloat xM = x[idM]+offsetX;
-        dfloat yM = y[idM]+offsetY;
-        dlong  id = Nfaces*Nfp*e + f*Nfp + n;
-        int nP;
-
-        int idP = findBestMatch(xM, yM,
-                                  Nfp,
-                                  faceNodes+fP*Nfp,
-                                  x+eP*Np,
-                                  y+eP*Np, &nP);
-
-        vmapM[id] = idM;
-        vmapP[id] = idP + eP*Np;
-        mapP[id] = eP*Nfaces*Nfp + fP*Nfp + nP;
-
-      }
-    }
-  }
-}
-
-//      printf("connecting (%d,%d) to (%d,%d) [ vmapM %d to vmapP %d ]\n",
-//             e,f,eP,fP, vmapM[id], vmapP[id]);
diff --git a/libs/mesh/meshConnectFaceNodes3D.cpp b/libs/mesh/meshConnectFaceNodes3D.cpp
deleted file mode 100644
index dfffaa28a..000000000
--- a/libs/mesh/meshConnectFaceNodes3D.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
-
-static int findBestMatch(dfloat x1, dfloat y1, dfloat z1,
-                   int Np2, int *nodeList, dfloat *x2, dfloat *y2, dfloat *z2, int *nP){
-
-  int matchIndex=0;
-  dfloat mindist2=1e9;
-
-  for(int n=0;n<Np2;++n){
-
-    /* next node */
-    const int i2 = nodeList[n];
-
-    /* distance between target and next node */
-    const dfloat dist2 = pow(x1-x2[i2],2) + pow(y1-y2[i2],2) + pow(z1-z2[i2],2);
-
-    /* if next node is closer to target update match */
-    if(n==0 || dist2<mindist2){
-      mindist2 = dist2;
-      matchIndex = i2;
-      *nP = n;
-    }
-  }
-  if(mindist2>1e-3) {
-    stringstream ss;
-    ss << "Bad match: x,y,z = " << x1 << ", " << y1 << ", " << z1 << "\n";
-    LIBP_ABORT(ss.str())
-  }
-
-  return matchIndex;
-}
-
-
-// serial face-node to face-node connection
-void mesh3D::ConnectFaceNodes(){
-
-  /* volume indices of the interior and exterior face nodes for each element */
-  vmapM = (dlong*) calloc(Nfp*Nfaces*Nelements, sizeof(dlong));
-  vmapP = (dlong*) calloc(Nfp*Nfaces*Nelements, sizeof(dlong));
-  mapP  = (dlong*) calloc(Nfp*Nfaces*Nelements, sizeof(dlong));
-
-  //check if we're connecting a periodic box mesh
-  int periodicFlag = 0;
-  if (settings.compareSetting("MESH FILE","BOX") &&
-      settings.compareSetting("BOX BOUNDARY FLAG","-1"))
-    periodicFlag = 1;
-
-  //box dimensions
-  dfloat DIMX, DIMY, DIMZ;
-  settings.getSetting("BOX DIMX", DIMX);
-  settings.getSetting("BOX DIMY", DIMY);
-  settings.getSetting("BOX DIMZ", DIMZ);
-
-  //box is centered at the origin
-  DIMX /= 2.0;
-  DIMY /= 2.0;
-  DIMZ /= 2.0;
-
-  /* assume elements already connected */
-  for(dlong e=0;e<Nelements;++e){
-    for(int f=0;f<Nfaces;++f){
-      dlong eP = EToE[e*Nfaces+f];
-      int fP = EToF[e*Nfaces+f];
-      if(eP<0 || fP<0){ // fake connections for unconnected faces
-        eP = e;
-        fP = f;
-      }
-
-      dfloat offsetX = 0.0;
-      dfloat offsetY = 0.0;
-      dfloat offsetZ = 0.0;
-
-      if (periodicFlag) {
-        //if the mesh is periodic, this is more complicated.
-        // check if this face is on a boundary face
-        bool top=true, bottom=true, front=true, back=true, left=true, right=true;
-        for(int n=0;n<NfaceVertices;++n){
-          dlong vid = e*Nverts + faceVertices[f*NfaceVertices+n];
-          if (fabs(EX[vid]-DIMX)>1e-4) right = false;
-          if (fabs(EX[vid]+DIMX)>1e-4) left = false;
-          if (fabs(EY[vid]-DIMY)>1e-4) back = false;
-          if (fabs(EY[vid]+DIMY)>1e-4) front = false;
-          if (fabs(EZ[vid]-DIMZ)>1e-4) top = false;
-          if (fabs(EZ[vid]+DIMZ)>1e-4) bottom = false;
-        }
-
-        if (right)  offsetX = -2.0*DIMX;
-        if (left)   offsetX =  2.0*DIMX;
-        if (back)   offsetY = -2.0*DIMY;
-        if (front)  offsetY =  2.0*DIMY;
-        if (top)    offsetZ = -2.0*DIMZ;
-        if (bottom) offsetZ =  2.0*DIMZ;
-      }
-
-      /* for each node on this face find the neighbor node */
-      for(int n=0;n<Nfp;++n){
-        dlong  idM = faceNodes[f*Nfp+n] + e*Np;
-        dfloat xM = x[idM]+offsetX;
-        dfloat yM = y[idM]+offsetY;
-        dfloat zM = z[idM]+offsetZ;
-        int nP;
-
-        int  idP = findBestMatch(xM, yM, zM,
-				 Nfp,
-				 faceNodes+fP*Nfp,
-				 x+eP*Np,
-				 y+eP*Np,
-				 z+eP*Np, &nP);
-
-        dlong id = Nfaces*Nfp*e + f*Nfp + n;
-        vmapM[id] = idM;
-        vmapP[id] = idP + eP*Np;
-        mapP[id] = eP*Nfaces*Nfp + fP*Nfp + nP;
-      }
-    }
-  }
-}
-
diff --git a/libs/mesh/meshConnectFaceVertices.cpp b/libs/mesh/meshConnectFaceVertices.cpp
new file mode 100644
index 000000000..fb6c871bd
--- /dev/null
+++ b/libs/mesh/meshConnectFaceVertices.cpp
@@ -0,0 +1,66 @@
+/*
+The MIT License (MIT)
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "mesh.hpp"
+
+namespace libp {
+
+// Serial face-vertex to face-vertex connection: fills VmapM and VmapP with indices into EToV.
+void mesh_t::ConnectFaceVertices(){
+
+  //allocate and fill a halo region in element-to-vertex mapping (totalHaloPairs extra elements, Nverts entries each)
+  EToV.realloc((Nelements+totalHaloPairs)*Nverts);
+  halo.Exchange(EToV, Nverts);
+
+  /* volume indices of the interior (VmapM) and exterior (VmapP) face vertices for each element */
+  VmapM.malloc(NfaceVertices*Nfaces*Nelements);
+  VmapP.malloc(NfaceVertices*Nfaces*Nelements);
+
+  /* assume elements already connected: EToE and EToF are only read here */
+  #pragma omp parallel for collapse(2)
+  for(dlong e=0;e<Nelements;++e){
+    for(int f=0;f<Nfaces;++f){
+      dlong eP = EToE[e*Nfaces+f]; // element on the other side of face f
+      int fP = EToF[e*Nfaces+f]; // face index on that element
+      if(eP<0 || fP<0){ // fake connections for unconnected faces: the face is paired with itself
+        eP = e;
+        fP = f;
+      }
+
+      /* for each vertex on this face find the neighbor vertex with the same EToV id */
+      for(int n=0;n<NfaceVertices;++n){
+        dlong idM = faceVertices[f*NfaceVertices+n] + e*Nverts;
+        hlong vM  = EToV[idM];
+        // NOTE(review): the idM initializer below is overwritten on the first pass; with no match idP ends on the last vertex of fP
+        dlong idP=idM;
+        for(int m=0;m<NfaceVertices;++m){
+          idP = faceVertices[fP*NfaceVertices+m] + eP*Nverts;
+          if (EToV[idP]==vM) break; // idP keeps the matching position
+        }
+
+        dlong id = Nfaces*NfaceVertices*e + f*NfaceVertices + n;
+        VmapM[id] = idM;
+        VmapP[id] = idP;
+      }
+    }
+  }
+}
+
+} //namespace libp
diff --git a/libs/mesh/meshConnectNodes.cpp b/libs/mesh/meshConnectNodes.cpp
new file mode 100644
index 000000000..64d8a6fc3
--- /dev/null
+++ b/libs/mesh/meshConnectNodes.cpp
@@ -0,0 +1,119 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "mesh.hpp"
+
+namespace libp {
+
+// Uniquely label each node with a global index (globalIds), used for gatherScatter; also builds the node-wise bc flag mapB.
+void mesh_t::ConnectNodes(){
+
+  hlong localNnodes = Np*Nelements;
+  hlong gatherNodeStart = localNnodes;
+  comm.Scan(localNnodes, gatherNodeStart); // presumably an inclusive prefix sum over ranks -- confirm against comm_t
+  gatherNodeStart -= localNnodes; // remove this rank's own count to get its starting offset
+
+  // form global node numbering (sized for local elements plus totalHaloPairs halo elements)
+  globalIds.malloc((totalHaloPairs+Nelements)*Np);
+
+  // initialize with local numbering: 1-based, shifted by this rank's offset
+  #pragma omp parallel for
+  for(dlong n=0;n<Nelements*Np;++n){
+    globalIds[n] = 1 + n + gatherNodeStart;
+  }
+
+  //make a node-wise bc flag by looking at all neighbors; -1 marks "no bc here yet"
+  mapB.malloc((Nelements+totalHaloPairs)*Np, -1);
+
+  #pragma omp parallel for
+  for (dlong e=0;e<Nelements;e++) {
+    for (int f=0;f<Nfaces;f++) {
+      int bc = EToB[f+e*Nfaces];
+      if (bc>0) { // only faces with a positive flag contribute
+        for (int n=0;n<Nfp;n++) {
+          const int fid = faceNodes[n+f*Nfp];
+          int bcn = mapB[fid+e*Np];
+          if (bcn == -1) { //if there's no bc here yet, write it
+            mapB[fid+e*Np] = bc;
+          } else { //if there's a bc, take the min
+            mapB[fid+e*Np] = std::min(bc,bcn);
+          }
+        }
+      }
+    }
+  }
+
+  hlong gatherChange = 1; // nonzero so the loop below runs at least once
+
+  // keep comparing numbers on positive and negative traces until convergence
+  while(gatherChange>0){
+
+    // reset change counter
+    gatherChange = 0;
+
+    // send halo data and recv into extension of buffer
+    halo.Exchange(globalIds, Np);
+    halo.Exchange(mapB, Np);
+
+    // compare trace nodes
+    // #pragma omp parallel for  (disabled; the loop body increments the shared gatherChange counter)
+    for(dlong e=0;e<Nelements;++e){
+
+      for(int n=0;n<Nfp*Nfaces;++n){
+        dlong id  = e*Nfp*Nfaces + n;
+        dlong idM = vmapM[id];
+        dlong idP = vmapP[id];
+        hlong gidM = globalIds[idM];
+        hlong gidP = globalIds[idP];
+        int bcM = mapB[idM];
+        int bcP = mapB[idP];
+
+        if(gidP<gidM){ // adopt the smaller global id from the other side of the trace
+          ++gatherChange;
+          globalIds[idM] = gidP;
+        }
+
+        if (bcP > 0) {
+          if (bcM == -1) {
+            //if there's no bc here yet, write it
+            mapB[idM] = bcP;
+            ++gatherChange;
+          } else if (bcP<bcM) { // otherwise keep the smaller flag
+            mapB[idM] = bcP;
+            ++gatherChange;
+          }
+        }
+      }
+    }
+
+    // sum up changes (comm.Allreduce updates gatherChange in place; the loop ends once it is zero)
+    comm.Allreduce(gatherChange);
+  }
+
+  o_mapB = platform.malloc<int>(mapB);
+}
+
+} //namespace libp
diff --git a/libs/mesh/meshCubatureNodesHex3D.cpp b/libs/mesh/meshCubatureNodesHex3D.cpp
index 5c9848a91..384ee5dd2 100644
--- a/libs/mesh/meshCubatureNodesHex3D.cpp
+++ b/libs/mesh/meshCubatureNodesHex3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,31 +25,32 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshHex3D::CubatureNodes(){
+namespace libp {
 
-  cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
-  cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
-  cubz = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
+void mesh_t::CubaturePhysicalNodesHex3D(){
+
+  cubx.malloc(Nelements*cubNp);
+  cuby.malloc(Nelements*cubNp);
+  cubz.malloc(Nelements*cubNp);
 
   //temp arrays
-  dfloat *Ix1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
-  dfloat *Iy1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
-  dfloat *Iz1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
+  memory<dfloat> Ix1(Nq*Nq*cubNq);
+  memory<dfloat> Iy1(Nq*Nq*cubNq);
+  memory<dfloat> Iz1(Nq*Nq*cubNq);
 
-  dfloat *Ix2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
-  dfloat *Iy2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
-  dfloat *Iz2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
+  memory<dfloat> Ix2(Nq*cubNq*cubNq);
+  memory<dfloat> Iy2(Nq*cubNq*cubNq);
+  memory<dfloat> Iz2(Nq*cubNq*cubNq);
 
   for(dlong e=0;e<Nelements;++e){ /* for each element */
 
-    dfloat *xe = x + e*Np;
-    dfloat *ye = y + e*Np;
-    dfloat *ze = z + e*Np;
-    dfloat *cubxe = cubx + e*cubNp;
-    dfloat *cubye = cuby + e*cubNp;
-    dfloat *cubze = cubz + e*cubNp;
+    dfloat *xe = x.ptr() + e*Np;
+    dfloat *ye = y.ptr() + e*Np;
+    dfloat *ze = z.ptr() + e*Np;
+    dfloat *cubxe = cubx.ptr() + e*cubNp;
+    dfloat *cubye = cuby.ptr() + e*cubNp;
+    dfloat *cubze = cubz.ptr() + e*cubNp;
 
     //interpolate physical coordinates to cubature
     for(int k=0;k<Nq;++k){
@@ -98,21 +99,18 @@ void meshHex3D::CubatureNodes(){
     }
   }
 
-  free(Ix1); free(Iy1); free(Iz1);
-  free(Ix2); free(Iy2); free(Iz2);
-
-  o_cubx = platform.malloc(Nelements*cubNp*sizeof(dfloat), cubx);
-  o_cuby = platform.malloc(Nelements*cubNp*sizeof(dfloat), cuby);
-  o_cubz = platform.malloc(Nelements*cubNp*sizeof(dfloat), cubz);
+  o_cubx = platform.malloc<dfloat>(Nelements*cubNp, cubx);
+  o_cuby = platform.malloc<dfloat>(Nelements*cubNp, cuby);
+  o_cubz = platform.malloc<dfloat>(Nelements*cubNp, cubz);
 
   //Face cubature
-  intx = (dfloat*) calloc(Nelements*Nfaces*cubNfp, sizeof(dfloat));
-  inty = (dfloat*) calloc(Nelements*Nfaces*cubNfp, sizeof(dfloat));
-  intz = (dfloat*) calloc(Nelements*Nfaces*cubNfp, sizeof(dfloat));
+  intx.malloc(Nelements*Nfaces*cubNfp);
+  inty.malloc(Nelements*Nfaces*cubNfp);
+  intz.malloc(Nelements*Nfaces*cubNfp);
 
-  dfloat *ix = (dfloat *) calloc(cubNq*Nq,sizeof(dfloat));
-  dfloat *iy = (dfloat *) calloc(cubNq*Nq,sizeof(dfloat));
-  dfloat *iz = (dfloat *) calloc(cubNq*Nq,sizeof(dfloat));
+  memory<dfloat> ix(cubNq*Nq);
+  memory<dfloat> iy(cubNq*Nq);
+  memory<dfloat> iz(cubNq*Nq);
   for(dlong e=0;e<Nelements;++e){
     for(int f=0;f<Nfaces;++f){
       //interpolate in i
@@ -162,9 +160,10 @@ void meshHex3D::CubatureNodes(){
       }
     }
   }
-  free(ix); free(iy); free(iz);
 
-  o_intx = platform.malloc(Nelements*Nfaces*cubNfp*sizeof(dfloat), intx);
-  o_inty = platform.malloc(Nelements*Nfaces*cubNfp*sizeof(dfloat), inty);
-  o_intz = platform.malloc(Nelements*Nfaces*cubNfp*sizeof(dfloat), intz);
+  o_intx = platform.malloc<dfloat>(Nelements*Nfaces*cubNfp, intx);
+  o_inty = platform.malloc<dfloat>(Nelements*Nfaces*cubNfp, inty);
+  o_intz = platform.malloc<dfloat>(Nelements*Nfaces*cubNfp, intz);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshCubatureNodesQuad2D.cpp b/libs/mesh/meshCubatureNodesQuad2D.cpp
index 70f40f46b..eecfa33a3 100644
--- a/libs/mesh/meshCubatureNodesQuad2D.cpp
+++ b/libs/mesh/meshCubatureNodesQuad2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,23 +25,24 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
 
-void meshQuad2D::CubatureNodes(){
+namespace libp {
 
-  cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
-  cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
+void mesh_t::CubaturePhysicalNodesQuad2D(){
+
+  cubx.malloc(Nelements*cubNp);
+  cuby.malloc(Nelements*cubNp);
 
   //temp arrays
-  dfloat *Ix1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
-  dfloat *Iy1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
+  memory<dfloat> Ix1(Nq*cubNq);
+  memory<dfloat> Iy1(Nq*cubNq);
 
   for(dlong e=0;e<Nelements;++e){ /* for each element */
 
-    dfloat *xe = x + e*Np;
-    dfloat *ye = y + e*Np;
-    dfloat *cubxe = cubx + e*cubNp;
-    dfloat *cubye = cuby + e*cubNp;
+    dfloat *xe = x.ptr() + e*Np;
+    dfloat *ye = y.ptr() + e*Np;
+    dfloat *cubxe = cubx.ptr() + e*cubNp;
+    dfloat *cubye = cuby.ptr() + e*cubNp;
 
     //interpolate physical coordinates to cubature
     for(int j=0;j<Nq;++j){
@@ -67,16 +68,12 @@ void meshQuad2D::CubatureNodes(){
     }
   }
 
-  free(Ix1);
-  free(Iy1);
-
-  o_cubx = platform.malloc(Nelements*cubNp*sizeof(dfloat), cubx);
-  o_cuby = platform.malloc(Nelements*cubNp*sizeof(dfloat), cuby);
-  o_cubz = o_cuby; // dummy to align with 3d
+  o_cubx = platform.malloc<dfloat>(Nelements*cubNp, cubx);
+  o_cuby = platform.malloc<dfloat>(Nelements*cubNp, cuby);
 
   //Face cubature
-  intx = (dfloat*) calloc(Nelements*Nfaces*cubNq, sizeof(dfloat));
-  inty = (dfloat*) calloc(Nelements*Nfaces*cubNq, sizeof(dfloat));
+  intx.malloc(Nelements*Nfaces*cubNq);
+  inty.malloc(Nelements*Nfaces*cubNq);
   for(dlong e=0;e<Nelements;++e){
     for(int f=0;f<Nfaces;++f){
       for(int n=0;n<cubNq;++n){
@@ -97,7 +94,8 @@ void meshQuad2D::CubatureNodes(){
     }
   }
 
-  o_intx = platform.malloc(Nelements*Nfaces*cubNq*sizeof(dfloat), intx);
-  o_inty = platform.malloc(Nelements*Nfaces*cubNq*sizeof(dfloat), inty);
-  o_intz = o_inty; // dummy to align with 3d
+  o_intx = platform.malloc<dfloat>(Nelements*Nfaces*cubNq, intx);
+  o_inty = platform.malloc<dfloat>(Nelements*Nfaces*cubNq, inty);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshCubatureNodesQuad3D.cpp b/libs/mesh/meshCubatureNodesQuad3D.cpp
index f50bca720..c0210f631 100644
--- a/libs/mesh/meshCubatureNodesQuad3D.cpp
+++ b/libs/mesh/meshCubatureNodesQuad3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,27 +25,28 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshQuad3D::CubatureNodes(){
+namespace libp {
 
-  cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
-  cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
-  cubz = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
+void mesh_t::CubaturePhysicalNodesQuad3D(){
+
+  cubx.malloc(Nelements*cubNp);
+  cuby.malloc(Nelements*cubNp);
+  cubz.malloc(Nelements*cubNp);
 
   //temp arrays
-  dfloat *Ix1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
-  dfloat *Iy1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
-  dfloat *Iz1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
+  memory<dfloat> Ix1(Nq*cubNq);
+  memory<dfloat> Iy1(Nq*cubNq);
+  memory<dfloat> Iz1(Nq*cubNq);
 
   for(dlong e=0;e<Nelements;++e){ /* for each element */
 
-    dfloat *xe = x + e*Np;
-    dfloat *ye = y + e*Np;
-    dfloat *ze = z + e*Np;
-    dfloat *cubxe = cubx + e*cubNp;
-    dfloat *cubye = cuby + e*cubNp;
-    dfloat *cubze = cubz + e*cubNp;
+    dfloat *xe = x.ptr() + e*Np;
+    dfloat *ye = y.ptr() + e*Np;
+    dfloat *ze = z.ptr() + e*Np;
+    dfloat *cubxe = cubx.ptr() + e*cubNp;
+    dfloat *cubye = cuby.ptr() + e*cubNp;
+    dfloat *cubze = cubz.ptr() + e*cubNp;
 
     //interpolate physical coordinates to cubature
     for(int j=0;j<Nq;++j){
@@ -81,17 +82,14 @@ void meshQuad3D::CubatureNodes(){
     }
   }
 
-  free(Ix1);
-  free(Iy1);
-
-  o_cubx = platform.malloc(Nelements*cubNp*sizeof(dfloat), cubx);
-  o_cuby = platform.malloc(Nelements*cubNp*sizeof(dfloat), cuby);
-  o_cubz = platform.malloc(Nelements*cubNp*sizeof(dfloat), cubz);
+  o_cubx = platform.malloc<dfloat>(Nelements*cubNp, cubx);
+  o_cuby = platform.malloc<dfloat>(Nelements*cubNp, cuby);
+  o_cubz = platform.malloc<dfloat>(Nelements*cubNp, cubz);
 
   //Face cubature
-  intx = (dfloat*) calloc(Nelements*Nfaces*cubNq, sizeof(dfloat));
-  inty = (dfloat*) calloc(Nelements*Nfaces*cubNq, sizeof(dfloat));
-  intz = (dfloat*) calloc(Nelements*Nfaces*cubNq, sizeof(dfloat));
+  intx.malloc(Nelements*Nfaces*cubNq);
+  inty.malloc(Nelements*Nfaces*cubNq);
+  intz.malloc(Nelements*Nfaces*cubNq);
   for(dlong e=0;e<Nelements;++e){
     for(int f=0;f<Nfaces;++f){
       for(int n=0;n<cubNq;++n){
@@ -117,7 +115,9 @@ void meshQuad3D::CubatureNodes(){
     }
   }
 
-  o_intx = platform.malloc(Nelements*Nfaces*cubNq*sizeof(dfloat), intx);
-  o_inty = platform.malloc(Nelements*Nfaces*cubNq*sizeof(dfloat), inty);
-  o_intz = platform.malloc(Nelements*Nfaces*cubNq*sizeof(dfloat), intz);
+  o_intx = platform.malloc<dfloat>(Nelements*Nfaces*cubNq, intx);
+  o_inty = platform.malloc<dfloat>(Nelements*Nfaces*cubNq, inty);
+  o_intz = platform.malloc<dfloat>(Nelements*Nfaces*cubNq, intz);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshCubatureNodesTet3D.cpp b/libs/mesh/meshCubatureNodesTet3D.cpp
index a3cfccd8f..3e83718c0 100644
--- a/libs/mesh/meshCubatureNodesTet3D.cpp
+++ b/libs/mesh/meshCubatureNodesTet3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,14 +25,15 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTet3D::CubatureNodes(){
+namespace libp {
+
+void mesh_t::CubaturePhysicalNodesTet3D(){
 
   if(cubNp){
-    cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
-    cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
-    cubz = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
+    cubx.malloc(Nelements*cubNp);
+    cuby.malloc(Nelements*cubNp);
+    cubz.malloc(Nelements*cubNp);
 
     dlong cnt = 0;
     for(dlong e=0;e<Nelements;++e){ /* for each element */
@@ -69,17 +70,17 @@ void meshTet3D::CubatureNodes(){
       }
     }
 
-    o_cubx = platform.malloc(Nelements*cubNp*sizeof(dfloat), cubx);
-    o_cuby = platform.malloc(Nelements*cubNp*sizeof(dfloat), cuby);
-    o_cubz = platform.malloc(Nelements*cubNp*sizeof(dfloat), cubz);
+    o_cubx = platform.malloc<dfloat>(Nelements*cubNp, cubx);
+    o_cuby = platform.malloc<dfloat>(Nelements*cubNp, cuby);
+    o_cubz = platform.malloc<dfloat>(Nelements*cubNp, cubz);
   }
 
   //Face cubature
   if(intNfp){
     // printf("Integration number of points: %d \n",intNfp);
-    intx = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat));
-    inty = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat));
-    intz = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat));
+    intx.malloc(Nelements*Nfaces*intNfp);
+    inty.malloc(Nelements*Nfaces*intNfp);
+    intz.malloc(Nelements*Nfaces*intNfp);
 
     for(dlong e=0;e<Nelements;++e){
       for(int f=0;f<Nfaces;++f){
@@ -103,17 +104,10 @@ void meshTet3D::CubatureNodes(){
       }
     }
 
-    o_intx =
-      platform.malloc(Nelements*Nfaces*intNfp*sizeof(dfloat),
-                          intx);
-
-    o_inty =
-      platform.malloc(Nelements*Nfaces*intNfp*sizeof(dfloat),
-                          inty);
-
-    o_intz =
-      platform.malloc(Nelements*Nfaces*intNfp*sizeof(dfloat),
-                          intz);
-
+    o_intx = platform.malloc<dfloat>(Nelements*Nfaces*intNfp, intx);
+    o_inty = platform.malloc<dfloat>(Nelements*Nfaces*intNfp, inty);
+    o_intz = platform.malloc<dfloat>(Nelements*Nfaces*intNfp, intz);
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshCubatureNodesTri2D.cpp b/libs/mesh/meshCubatureNodesTri2D.cpp
index 2d3d9abfb..66ed2f959 100644
--- a/libs/mesh/meshCubatureNodesTri2D.cpp
+++ b/libs/mesh/meshCubatureNodesTri2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,12 +25,13 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
 
-void meshTri2D::CubatureNodes(){
+namespace libp {
 
-  cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
-  cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
+void mesh_t::CubaturePhysicalNodesTri2D(){
+
+  cubx.malloc(Nelements*cubNp);
+  cuby.malloc(Nelements*cubNp);
 
   dlong cnt = 0;
   for(dlong e=0;e<Nelements;++e){ /* for each element */
@@ -58,13 +59,12 @@ void meshTri2D::CubatureNodes(){
     }
   }
 
-  o_cubx = platform.malloc(Nelements*cubNp*sizeof(dfloat), cubx);
-  o_cuby = platform.malloc(Nelements*cubNp*sizeof(dfloat), cuby);
-  o_cubz = o_cuby; // dummy to align with 3d
+  o_cubx = platform.malloc<dfloat>(Nelements*cubNp, cubx);
+  o_cuby = platform.malloc<dfloat>(Nelements*cubNp, cuby);
 
   //Face cubature
-  intx = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat));
-  inty = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat));
+  intx.malloc(Nelements*Nfaces*intNfp);
+  inty.malloc(Nelements*Nfaces*intNfp);
   for(dlong e=0;e<Nelements;++e){
     for(int f=0;f<Nfaces;++f){
       for(int n=0;n<intNfp;++n){
@@ -84,7 +84,8 @@ void meshTri2D::CubatureNodes(){
     }
   }
 
-  o_intx = platform.malloc(Nelements*Nfaces*intNfp*sizeof(dfloat), intx);
-  o_inty = platform.malloc(Nelements*Nfaces*intNfp*sizeof(dfloat), inty);
-  o_intz = o_inty; // dummy to align with 3d
+  o_intx = platform.malloc<dfloat>(Nelements*Nfaces*intNfp, intx);
+  o_inty = platform.malloc<dfloat>(Nelements*Nfaces*intNfp, inty);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshCubatureNodesTri3D.cpp b/libs/mesh/meshCubatureNodesTri3D.cpp
index 065fd1396..6203b971b 100644
--- a/libs/mesh/meshCubatureNodesTri3D.cpp
+++ b/libs/mesh/meshCubatureNodesTri3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,13 +25,14 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTri3D::CubatureNodes(){
+namespace libp {
 
-  cubx = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
-  cuby = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
-  cubz = (dfloat*) calloc(Nelements*cubNp,sizeof(dfloat));
+void mesh_t::CubaturePhysicalNodesTri3D(){
+
+  cubx.malloc(Nelements*cubNp);
+  cuby.malloc(Nelements*cubNp);
+  cubz.malloc(Nelements*cubNp);
 
   dlong cnt = 0;
   for(dlong e=0;e<Nelements;++e){ /* for each element */
@@ -70,14 +71,14 @@ void meshTri3D::CubatureNodes(){
     }
   }
 
-  o_cubx = platform.malloc(Nelements*cubNp*sizeof(dfloat), cubx);
-  o_cuby = platform.malloc(Nelements*cubNp*sizeof(dfloat), cuby);
-  o_cubz = platform.malloc(Nelements*cubNp*sizeof(dfloat), cubz);
+  o_cubx = platform.malloc<dfloat>(Nelements*cubNp, cubx);
+  o_cuby = platform.malloc<dfloat>(Nelements*cubNp, cuby);
+  o_cubz = platform.malloc<dfloat>(Nelements*cubNp, cubz);
 
   //Face cubature
-  intx = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat));
-  inty = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat));
-  intz = (dfloat*) calloc(Nelements*Nfaces*intNfp, sizeof(dfloat));
+  intx.malloc(Nelements*Nfaces*intNfp);
+  inty.malloc(Nelements*Nfaces*intNfp);
+  intz.malloc(Nelements*Nfaces*intNfp);
   for(dlong e=0;e<Nelements;++e){
     for(int f=0;f<Nfaces;++f){
       for(int n=0;n<intNfp;++n){
@@ -102,7 +103,9 @@ void meshTri3D::CubatureNodes(){
     }
   }
 
-  o_intx = platform.malloc(Nelements*Nfaces*intNfp*sizeof(dfloat), intx);
-  o_inty = platform.malloc(Nelements*Nfaces*intNfp*sizeof(dfloat), inty);
-  o_intz = platform.malloc(Nelements*Nfaces*intNfp*sizeof(dfloat), intz);
+  o_intx = platform.malloc<dfloat>(Nelements*Nfaces*intNfp, intx);
+  o_inty = platform.malloc<dfloat>(Nelements*Nfaces*intNfp, inty);
+  o_intz = platform.malloc<dfloat>(Nelements*Nfaces*intNfp, intz);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshCubatureSetupHex3D.cpp b/libs/mesh/meshCubatureSetupHex3D.cpp
index d2e05db24..a0dd5bc62 100644
--- a/libs/mesh/meshCubatureSetupHex3D.cpp
+++ b/libs/mesh/meshCubatureSetupHex3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,9 +25,10 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshHex3D::CubatureSetup(){
+namespace libp {
+
+void mesh_t::CubatureSetupHex3D(){
 
   /* Quadrature data */
   cubN = N+1;
@@ -37,24 +38,19 @@ void meshHex3D::CubatureSetup(){
   intNfp = cubNq*cubNq;
 
   // cubN+1 point Gauss-Legendre quadrature
-  cubr = (dfloat *) malloc(cubNq*sizeof(dfloat));
-  cubw = (dfloat *) malloc(cubNq*sizeof(dfloat));
   JacobiGQ(0, 0, cubN, cubr, cubw);
 
   // GLL to GL interpolation matrix
-  cubInterp = (dfloat *) malloc(Nq*cubNq*sizeof(dfloat));
-  InterpolationMatrix1D(N, Nq, r, cubNq, cubr, cubInterp); //uses the fact that r = gllz for 1:Nq
+  InterpolationMatrix1D(N, gllz, cubr, cubInterp);
 
   //cubature project cubProject = cubInterp^T
-  cubProject = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
-  matrixTranspose(cubNq, Nq, cubInterp, Nq, cubProject, cubNq);
+  cubProject.malloc(cubNq*Nq);
+  linAlg_t::matrixTranspose(cubNq, Nq, cubInterp, Nq, cubProject, cubNq);
 
   //cubature derivates matrix, cubD: differentiate on cubature nodes
-  cubD = (dfloat *) malloc(cubNq*cubNq*sizeof(dfloat));
-  Dmatrix1D(cubN, cubNq, cubr, cubNq, cubr, cubD);
+  Dmatrix1D(cubN, cubr, cubr, cubD);
 
   // weak cubature derivative cubPDT = cubProject * cubD^T
-  cubPDT  = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
   CubatureWeakDmatrix1D(Nq, cubNq, cubProject, cubD, cubPDT);
 
   // add compile time constants to kernels
@@ -65,73 +61,69 @@ void meshHex3D::CubatureSetup(){
   props["defines/" "p_cubNfp"]= cubNfp;
 
   // build transposes (we hold matrices as column major on device)
-  dfloat *cubProjectT = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
-  dfloat *cubInterpT   = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
-  matrixTranspose(cubNq, Nq, cubInterp, Nq, cubInterpT, cubNq);
-  matrixTranspose(Nq, cubNq, cubProject, cubNq, cubProjectT, Nq);
+  memory<dfloat> cubProjectT(cubNq*Nq);
+  memory<dfloat> cubInterpT(cubNq*Nq);
+  linAlg_t::matrixTranspose(cubNq, Nq, cubInterp, Nq, cubInterpT, cubNq);
+  linAlg_t::matrixTranspose(Nq, cubNq, cubProject, cubNq, cubProjectT, Nq);
 
-  dfloat *cubPDTT     = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
-  matrixTranspose(Nq, cubNq, cubPDT, cubNq, cubPDTT, Nq);
+  memory<dfloat> cubPDTT(cubNq*Nq);
+  linAlg_t::matrixTranspose(Nq, cubNq, cubPDT, cubNq, cubPDTT, Nq);
 
-  o_cubInterp   = platform.malloc(Nq*cubNq*sizeof(dfloat), cubInterpT);
-  o_cubProject = platform.malloc(Nq*cubNq*sizeof(dfloat), cubProjectT);
+  o_cubInterp  = platform.malloc<dfloat>(Nq*cubNq, cubInterpT);
+  o_cubProject = platform.malloc<dfloat>(Nq*cubNq, cubProjectT);
 
-  o_cubPDT = platform.malloc(Nq*cubNq*sizeof(dfloat), cubPDTT);
-  o_cubD = platform.malloc(cubNq*cubNq*sizeof(dfloat), cubD);
+  o_cubPDT = platform.malloc<dfloat>(Nq*cubNq, cubPDTT);
+  o_cubD   = platform.malloc<dfloat>(cubNq*cubNq, cubD);
 
   o_intInterp = o_cubInterp;
   o_intLIFT = o_cubProject;
 
-  free(cubPDTT);
-  free(cubProjectT);
-  free(cubInterpT);
-
-  cubvgeo = (dfloat*) calloc(Nelements*Nvgeo*cubNp, sizeof(dfloat));
-  cubggeo = (dfloat*) calloc(Nelements*Nggeo*cubNp, sizeof(dfloat));
-
-  cubsgeo = (dfloat*) calloc(Nelements*Nsgeo*cubNq*cubNq*Nfaces, sizeof(dfloat));
+  cubwJ.malloc(Nelements*cubNp);
+  cubvgeo.malloc(Nelements*Nvgeo*cubNp);
+  cubggeo.malloc(Nelements*Nggeo*cubNp);
+  cubsgeo.malloc(Nelements*Nsgeo*cubNq*cubNq*Nfaces);
 
   //temp arrays
-  dfloat *xre = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *xse = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *xte = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yre = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yse = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yte = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *zre = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *zse = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *zte = (dfloat*) calloc(Np, sizeof(dfloat));
-
-  dfloat *xre1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
-  dfloat *xse1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
-  dfloat *xte1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
-  dfloat *yre1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
-  dfloat *yse1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
-  dfloat *yte1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
-  dfloat *zre1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
-  dfloat *zse1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
-  dfloat *zte1 = (dfloat*) calloc(Nq*Nq*cubNq, sizeof(dfloat));
-
-  dfloat *xre2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
-  dfloat *xse2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
-  dfloat *xte2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
-  dfloat *yre2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
-  dfloat *yse2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
-  dfloat *yte2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
-  dfloat *zre2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
-  dfloat *zse2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
-  dfloat *zte2 = (dfloat*) calloc(Nq*cubNq*cubNq, sizeof(dfloat));
+  memory<dfloat> xre(Np);
+  memory<dfloat> xse(Np);
+  memory<dfloat> xte(Np);
+  memory<dfloat> yre(Np);
+  memory<dfloat> yse(Np);
+  memory<dfloat> yte(Np);
+  memory<dfloat> zre(Np);
+  memory<dfloat> zse(Np);
+  memory<dfloat> zte(Np);
+
+  memory<dfloat> xre1(Nq*Nq*cubNq);
+  memory<dfloat> xse1(Nq*Nq*cubNq);
+  memory<dfloat> xte1(Nq*Nq*cubNq);
+  memory<dfloat> yre1(Nq*Nq*cubNq);
+  memory<dfloat> yse1(Nq*Nq*cubNq);
+  memory<dfloat> yte1(Nq*Nq*cubNq);
+  memory<dfloat> zre1(Nq*Nq*cubNq);
+  memory<dfloat> zse1(Nq*Nq*cubNq);
+  memory<dfloat> zte1(Nq*Nq*cubNq);
+
+  memory<dfloat> xre2(Nq*cubNq*cubNq);
+  memory<dfloat> xse2(Nq*cubNq*cubNq);
+  memory<dfloat> xte2(Nq*cubNq*cubNq);
+  memory<dfloat> yre2(Nq*cubNq*cubNq);
+  memory<dfloat> yse2(Nq*cubNq*cubNq);
+  memory<dfloat> yte2(Nq*cubNq*cubNq);
+  memory<dfloat> zre2(Nq*cubNq*cubNq);
+  memory<dfloat> zse2(Nq*cubNq*cubNq);
+  memory<dfloat> zte2(Nq*cubNq*cubNq);
 
   //surface temp arrays
-  dfloat *xr1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
-  dfloat *xs1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
-  dfloat *xt1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
-  dfloat *yr1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
-  dfloat *ys1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
-  dfloat *yt1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
-  dfloat *zr1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
-  dfloat *zs1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
-  dfloat *zt1 = (dfloat*) calloc(Nq*cubNq, sizeof(dfloat));
+  memory<dfloat> xr1(Nq*cubNq);
+  memory<dfloat> xs1(Nq*cubNq);
+  memory<dfloat> xt1(Nq*cubNq);
+  memory<dfloat> yr1(Nq*cubNq);
+  memory<dfloat> ys1(Nq*cubNq);
+  memory<dfloat> yt1(Nq*cubNq);
+  memory<dfloat> zr1(Nq*cubNq);
+  memory<dfloat> zs1(Nq*cubNq);
+  memory<dfloat> zt1(Nq*cubNq);
 
   //geometric data for quadrature
   for(dlong e=0;e<Nelements;++e){ /* for each element */
@@ -232,11 +224,7 @@ void meshHex3D::CubatureSetup(){
           /* compute geometric factors for affine coordinate transform*/
           dfloat J = xr*(ys*zt-zs*yt) - yr*(xs*zt-zs*xt) + zr*(xs*yt-ys*xt);
 
-          if(J<1e-8) {
-            stringstream ss;
-            ss << "Negative J found at element " << e << "\n";
-            LIBP_ABORT(ss.str())
-          }
+          LIBP_ABORT("Negative J found at element " << e, J<1e-8);
 
           dfloat rx =  (ys*zt - zs*yt)/J, ry = -(xs*zt - zs*xt)/J, rz =  (xs*yt - ys*xt)/J;
           dfloat sx = -(yr*zt - zr*yt)/J, sy =  (xr*zt - zr*xt)/J, sz = -(xr*yt - yr*xt)/J;
@@ -270,7 +258,9 @@ void meshHex3D::CubatureSetup(){
           cubggeo[base + cubNp*G11ID] = JW*(sx*sx + sy*sy + sz*sz);
           cubggeo[base + cubNp*G12ID] = JW*(sx*tx + sy*ty + sz*tz);
           cubggeo[base + cubNp*G22ID] = JW*(tx*tx + ty*ty + tz*tz);
-          cubggeo[base + cubNp*GWJID] = JW;
+
+          base = cubNp*e + i + j*cubNq + k*cubNq*cubNq;
+          cubwJ[base] = JW;
         }
       }
     }
@@ -360,29 +350,10 @@ void meshHex3D::CubatureSetup(){
   }
 
 
-  o_cubvgeo =
-    platform.malloc(Nelements*Nvgeo*cubNp*sizeof(dfloat),
-        cubvgeo);
-
-  o_cubsgeo =
-    platform.malloc(Nelements*Nfaces*cubNq*cubNq*Nsgeo*sizeof(dfloat),
-        cubsgeo);
-
-  o_cubggeo =
-    platform.malloc(Nelements*Nggeo*cubNp*sizeof(dfloat),
-        cubggeo);
-
-  free(xre); free(xse); free(xte);
-  free(yre); free(yse); free(yte);
-  free(zre); free(zse); free(zte);
-  free(xre1); free(xse1); free(xte1);
-  free(yre1); free(yse1); free(yte1);
-  free(zre1); free(zse1); free(zte1);
-  free(xre2); free(xse2); free(xte2);
-  free(yre2); free(yse2); free(yte2);
-  free(zre2); free(zse2); free(zte2);
-
-  free(xr1); free(xs1); free(xt1);
-  free(yr1); free(ys1); free(yt1);
-  free(zr1); free(zs1); free(zt1);
+  o_cubwJ = platform.malloc<dfloat>(Nelements*cubNp, cubwJ);
+  o_cubvgeo = platform.malloc<dfloat>(Nelements*Nvgeo*cubNp, cubvgeo);
+  o_cubsgeo = platform.malloc<dfloat>(Nelements*Nfaces*cubNq*cubNq*Nsgeo, cubsgeo);
+  o_cubggeo = platform.malloc<dfloat>(Nelements*Nggeo*cubNp, cubggeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshCubatureSetupQuad2D.cpp b/libs/mesh/meshCubatureSetupQuad2D.cpp
index 10989098c..e806600f2 100644
--- a/libs/mesh/meshCubatureSetupQuad2D.cpp
+++ b/libs/mesh/meshCubatureSetupQuad2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,16 +25,10 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshQuad3D::CubatureSetup(){
-  mesh_t *mesh_p = (mesh_t*) this;
-  meshQuad2D* trimesh = (meshQuad2D*) mesh_p;
-  trimesh->meshQuad2D::CubatureSetup();
-}
+namespace libp {
 
-void meshQuad2D::CubatureSetup(){
+void mesh_t::CubatureSetupQuad2D(){
 
   /* Quadrature data */
   cubN = N+1;
@@ -44,24 +38,19 @@ void meshQuad2D::CubatureSetup(){
   intNfp = cubNq;
 
   // cubN+1 point Gauss-Legendre quadrature
-  cubr = (dfloat *) malloc(cubNq*sizeof(dfloat));
-  cubw = (dfloat *) malloc(cubNq*sizeof(dfloat));
   JacobiGQ(0, 0, cubN, cubr, cubw);
 
   // GLL to GL interpolation matrix
-  cubInterp = (dfloat *) malloc(Nq*cubNq*sizeof(dfloat));
-  InterpolationMatrix1D(N, Nq, r, cubNq, cubr, cubInterp); //uses the fact that r = gllz for 1:Nq
+  InterpolationMatrix1D(N, gllz, cubr, cubInterp);
 
   //cubature project cubProject = cubInterp^T
-  cubProject = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
-  matrixTranspose(cubNq, Nq, cubInterp, Nq, cubProject, cubNq);
+  cubProject.malloc(cubNq*Nq);
+  linAlg_t::matrixTranspose(cubNq, Nq, cubInterp, Nq, cubProject, cubNq);
 
   //cubature derivates matrix, cubD: differentiate on cubature nodes
-  cubD = (dfloat *) malloc(cubNq*cubNq*sizeof(dfloat));
-  Dmatrix1D(cubN, cubNq, cubr, cubNq, cubr, cubD);
+  Dmatrix1D(cubN, cubr, cubr, cubD);
 
   // weak cubature derivative cubPDT = cubProject * cubD^T
-  cubPDT  = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
   CubatureWeakDmatrix1D(Nq, cubNq, cubProject, cubD, cubPDT);
 
   // add compile time constants to kernels
@@ -72,42 +61,38 @@ void meshQuad2D::CubatureSetup(){
   props["defines/" "p_cubNfp"]= cubNfp;
 
   // build transposes (we hold matrices as column major on device)
-  dfloat *cubProjectT = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
-  dfloat *cubInterpT   = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
-  matrixTranspose(cubNq, Nq, cubInterp, Nq, cubInterpT, cubNq);
-  matrixTranspose(Nq, cubNq, cubProject, cubNq, cubProjectT, Nq);
+  memory<dfloat> cubProjectT(cubNq*Nq);
+  memory<dfloat> cubInterpT(cubNq*Nq);
+  linAlg_t::matrixTranspose(cubNq, Nq, cubInterp, Nq, cubInterpT, cubNq);
+  linAlg_t::matrixTranspose(Nq, cubNq, cubProject, cubNq, cubProjectT, Nq);
 
-  dfloat *cubPDTT     = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
-  matrixTranspose(Nq, cubNq, cubPDT, cubNq, cubPDTT, Nq);
+  memory<dfloat> cubPDTT(cubNq*Nq);
+  linAlg_t::matrixTranspose(Nq, cubNq, cubPDT, cubNq, cubPDTT, Nq);
 
-  o_cubInterp   = platform.malloc(Nq*cubNq*sizeof(dfloat), cubInterpT);
-  o_cubProject = platform.malloc(Nq*cubNq*sizeof(dfloat), cubProjectT);
+  o_cubInterp  = platform.malloc<dfloat>(Nq*cubNq, cubInterpT);
+  o_cubProject = platform.malloc<dfloat>(Nq*cubNq, cubProjectT);
 
-  o_cubPDT = platform.malloc(Nq*cubNq*sizeof(dfloat), cubPDTT);
-  o_cubD = platform.malloc(cubNq*cubNq*sizeof(dfloat), cubD);
+  o_cubPDT = platform.malloc<dfloat>(Nq*cubNq, cubPDTT);
+  o_cubD   = platform.malloc<dfloat>(cubNq*cubNq, cubD);
 
   o_intInterp = o_cubInterp;
   o_intLIFT = o_cubProject;
 
-  free(cubPDTT);
-  free(cubProjectT);
-  free(cubInterpT);
-
-  cubvgeo = (dfloat*) calloc(Nelements*Nvgeo*cubNp, sizeof(dfloat));
-  cubggeo = (dfloat*) calloc(Nelements*Nggeo*cubNp, sizeof(dfloat));
-
-  cubsgeo = (dfloat*) calloc(Nelements*Nsgeo*cubNq*Nfaces, sizeof(dfloat));
+  cubwJ.malloc(Nelements*cubNp);
+  cubvgeo.malloc(Nelements*Nvgeo*cubNp);
+  cubggeo.malloc(Nelements*Nggeo*cubNp);
+  cubsgeo.malloc(Nelements*Nsgeo*cubNq*Nfaces);
 
   //temp arrays
-  dfloat *xre = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *xse = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yre = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yse = (dfloat*) calloc(Np, sizeof(dfloat));
+  memory<dfloat> xre(Np);
+  memory<dfloat> xse(Np);
+  memory<dfloat> yre(Np);
+  memory<dfloat> yse(Np);
 
-  dfloat *xre1 = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
-  dfloat *xse1 = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
-  dfloat *yre1 = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
-  dfloat *yse1 = (dfloat*) calloc(cubNq*Nq, sizeof(dfloat));
+  memory<dfloat> xre1(cubNq*Nq);
+  memory<dfloat> xse1(cubNq*Nq);
+  memory<dfloat> yre1(cubNq*Nq);
+  memory<dfloat> yse1(cubNq*Nq);
 
   //geometric data for quadrature
   for(dlong e=0;e<Nelements;++e){ /* for each element */
@@ -158,11 +143,8 @@ void meshQuad2D::CubatureSetup(){
         /* compute geometric factors for affine coordinate transform*/
         dfloat J = xr*ys - xs*yr;
 
-        if(J<1e-8) {
-          stringstream ss;
-          ss << "Negative J found at element " << e << "\n";
-          LIBP_ABORT(ss.str())
-        }
+        LIBP_ABORT("Negative J found at element " << e, J<1e-8);
+
         dfloat rx =  ys/J;
         dfloat ry = -xs/J;
         dfloat sx = -yr/J;
@@ -185,7 +167,8 @@ void meshQuad2D::CubatureSetup(){
         cubggeo[base + cubNp*G00ID] = JW*(rx*rx + ry*ry);
         cubggeo[base + cubNp*G01ID] = JW*(rx*sx + ry*sy);
         cubggeo[base + cubNp*G11ID] = JW*(sx*sx + sy*sy);
-        cubggeo[base + cubNp*GWJID] = JW;
+
+        cubwJ[cubNp*e + i + j*cubNq] = JW;
       }
     }
 
@@ -239,10 +222,10 @@ void meshQuad2D::CubatureSetup(){
     }
   }
 
-  o_cubvgeo = platform.malloc(Nelements*Nvgeo*cubNp*sizeof(dfloat), cubvgeo);
-  o_cubggeo = platform.malloc(Nelements*Nggeo*cubNp*sizeof(dfloat), cubggeo);
-  o_cubsgeo = platform.malloc(Nelements*Nfaces*cubNq*Nsgeo*sizeof(dfloat), cubsgeo);
-
-  free(xre); free(xse); free(yre); free(yse);
-  free(xre1); free(xse1); free(yre1); free(yse1);
+  o_cubwJ   = platform.malloc<dfloat>(Nelements*cubNp, cubwJ);
+  o_cubvgeo = platform.malloc<dfloat>(Nelements*Nvgeo*cubNp, cubvgeo);
+  o_cubggeo = platform.malloc<dfloat>(Nelements*Nggeo*cubNp, cubggeo);
+  o_cubsgeo = platform.malloc<dfloat>(Nelements*Nfaces*cubNq*Nsgeo, cubsgeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshCubatureSetupTet3D.cpp b/libs/mesh/meshCubatureSetupTet3D.cpp
index 52817c813..a24b9f706 100644
--- a/libs/mesh/meshCubatureSetupTet3D.cpp
+++ b/libs/mesh/meshCubatureSetupTet3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,41 +25,39 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTet3D::CubatureSetup(){
+namespace libp {
+
+void mesh_t::CubatureSetupTet3D(){
 
   /* Cubature data */
   cubN = 2*N; //cubature order
-  CubatureNodesTet3D(cubN, &cubNp, &cubr, &cubs, &cubt, &cubw);
+  CubatureNodesTet3D(cubN, cubNp, cubr, cubs, cubt, cubw);
 
-  cubInterp = (dfloat *) malloc(Np*cubNp*sizeof(dfloat));
-  InterpolationMatrixTet3D(N, Np, r, s, t, cubNp, cubr, cubs, cubt, cubInterp);
+  InterpolationMatrixTet3D(N, r, s, t, cubr, cubs, cubt, cubInterp);
 
   //cubature project cubProject = M^{-1} * cubInterp^T
   // Defined such that cubProject * cubW * cubInterp = Identity
-  cubProject = (dfloat*) calloc(cubNp*Np, sizeof(dfloat));
-  CubaturePmatrixTet3D(N, Np, r, s, t, cubNp, cubr, cubs, cubt, cubProject);
+  CubaturePmatrixTet3D(N, r, s, t, cubr, cubs, cubt, cubProject);
 
   //cubature derivates matrices, cubD: differentiate on cubature nodes
   // we dont use cubD on Tris/Tets  so skip computing
 
   // Instead, it's cheaper to:
   // make weak cubature derivatives cubPDT = cubProject * cubD^T
-  cubPDT  = (dfloat*) calloc(3*cubNp*Np, sizeof(dfloat));
+  CubatureWeakDmatricesTet3D(N, r, s, t,
+                             cubr, cubs, cubt,
+                             cubPDT);
   cubPDrT = cubPDT + 0*cubNp*Np;
   cubPDsT = cubPDT + 1*cubNp*Np;
   cubPDtT = cubPDT + 2*cubNp*Np;
-  CubatureWeakDmatricesTet3D(N, Np, r, s, t, cubNp, cubr, cubs, cubt,
-                             cubPDrT, cubPDsT, cubPDtT);
 
   // Surface cubature nodes
-  CubatureNodesTri2D(cubN, &intNfp, &intr, &ints, &intw);
+  CubatureNodesTri2D(cubN, intNfp, intr, ints, intw);
   cubNfp = intNfp;
 
-  intInterp = (dfloat*) calloc(intNfp*Nfaces*Nfp, sizeof(dfloat));
-  intLIFT = (dfloat*) calloc(Nfaces*intNfp*Np, sizeof(dfloat));
-  CubatureSurfaceMatricesTet3D(N, Np, r, s, t, faceNodes, intNfp, intr, ints, intw,
+  CubatureSurfaceMatricesTet3D(N, r, s, t, faceNodes,
+                               intr, ints, intw,
                                intInterp, intLIFT);
 
   // add compile time constants to kernels
@@ -70,10 +68,10 @@ void meshTet3D::CubatureSetup(){
   props["defines/" "p_cubNfp"]= cubNfp;
 
   // build transposes (we hold matrices as column major on device)
-  dfloat *cubProjectT = (dfloat*) calloc(cubNp*Np, sizeof(dfloat));
-  dfloat *cubInterpT   = (dfloat*) calloc(cubNp*Np, sizeof(dfloat));
-  matrixTranspose(cubNp, Np, cubInterp, Np, cubInterpT, cubNp);
-  matrixTranspose(Np, cubNp, cubProject, cubNp, cubProjectT, Np);
+  memory<dfloat> cubProjectT(cubNp*Np);
+  memory<dfloat> cubInterpT(cubNp*Np);
+  linAlg_t::matrixTranspose(cubNp, Np, cubInterp, Np, cubInterpT, cubNp);
+  linAlg_t::matrixTranspose(Np, cubNp, cubProject, cubNp, cubProjectT, Np);
 
   //pre-multiply cubProject by W on device
   for(int n=0;n<cubNp;++n){
@@ -82,13 +80,13 @@ void meshTet3D::CubatureSetup(){
     }
   }
 
-  dfloat *cubPDTT = (dfloat*) calloc(3*cubNp*Np, sizeof(dfloat));
-  dfloat *cubPDrTT = cubPDTT + 0*cubNp*Np;
-  dfloat *cubPDsTT = cubPDTT + 1*cubNp*Np;
-  dfloat *cubPDtTT = cubPDTT + 2*cubNp*Np;
-  matrixTranspose(Np, cubNp, cubPDrT, cubNp, cubPDrTT, Np);
-  matrixTranspose(Np, cubNp, cubPDsT, cubNp, cubPDsTT, Np);
-  matrixTranspose(Np, cubNp, cubPDtT, cubNp, cubPDtTT, Np);
+  memory<dfloat> cubPDTT(3*cubNp*Np);
+  memory<dfloat> cubPDrTT = cubPDTT + 0*cubNp*Np;
+  memory<dfloat> cubPDsTT = cubPDTT + 1*cubNp*Np;
+  memory<dfloat> cubPDtTT = cubPDTT + 2*cubNp*Np;
+  linAlg_t::matrixTranspose(Np, cubNp, cubPDrT, cubNp, cubPDrTT, Np);
+  linAlg_t::matrixTranspose(Np, cubNp, cubPDsT, cubNp, cubPDsTT, Np);
+  linAlg_t::matrixTranspose(Np, cubNp, cubPDtT, cubNp, cubPDtTT, Np);
 
   //pre-multiply cubPDT by W on device
   for(int n=0;n<cubNp;++n){
@@ -100,27 +98,23 @@ void meshTet3D::CubatureSetup(){
   }
 
   // build surface integration matrix transposes
-  dfloat *intLIFTT = (dfloat*) calloc(Np*Nfaces*intNfp, sizeof(dfloat));
-  dfloat *intInterpT = (dfloat*) calloc(Nfp*Nfaces*intNfp, sizeof(dfloat));
-  matrixTranspose(Np, Nfaces*intNfp, intLIFT, Nfaces*intNfp, intLIFTT, Np);
-  matrixTranspose(Nfaces*intNfp, Nfp, intInterp, Nfp, intInterpT, Nfaces*intNfp);
-
+  memory<dfloat> intLIFTT(Np*Nfaces*intNfp);
+  memory<dfloat> intInterpT(Nfp*Nfaces*intNfp);
+  linAlg_t::matrixTranspose(Np, Nfaces*intNfp, intLIFT, Nfaces*intNfp, intLIFTT, Np);
+  linAlg_t::matrixTranspose(Nfaces*intNfp, Nfp, intInterp, Nfp, intInterpT, Nfaces*intNfp);
 
-  o_cubvgeo = o_vgeo;// dummy
-  o_cubsgeo = o_sgeo; //dummy cubature geo factors
+  o_cubInterp  = platform.malloc<dfloat>(Np*cubNp, cubInterpT);
+  o_cubProject = platform.malloc<dfloat>(Np*cubNp, cubProjectT);
 
-  o_cubInterp = platform.malloc(Np*cubNp*sizeof(dfloat), cubInterpT);
-  o_cubProject = platform.malloc(Np*cubNp*sizeof(dfloat), cubProjectT);
+  o_cubPDT = platform.malloc<dfloat>(3*Np*cubNp, cubPDTT);
 
-  o_cubPDT = platform.malloc(3*Np*cubNp*sizeof(dfloat), cubPDTT);
-  o_cubD = o_cubPDT; //dummy
+  o_intInterp = platform.malloc<dfloat>(Nfp*Nfaces*intNfp, intInterpT);
+  o_intLIFT   = platform.malloc<dfloat>(Np*Nfaces*intNfp, intLIFTT);
 
-  o_intInterp = platform.malloc(Nfp*Nfaces*intNfp*sizeof(dfloat), intInterpT);
-  o_intLIFT = platform.malloc(Np*Nfaces*intNfp*sizeof(dfloat), intLIFTT);
-
-  free(cubPDTT);
-  free(cubProjectT);
-  free(cubInterpT);
-  free(intLIFTT);
-  free(intInterpT);
+  o_cubwJ = o_wJ;
+  o_cubvgeo = o_vgeo;
+  o_cubggeo = o_ggeo;
+  o_cubsgeo = o_sgeo;
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshCubatureSetupTri2D.cpp b/libs/mesh/meshCubatureSetupTri2D.cpp
index a342f5e0f..e7c55785b 100644
--- a/libs/mesh/meshCubatureSetupTri2D.cpp
+++ b/libs/mesh/meshCubatureSetupTri2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,50 +25,40 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTri3D::CubatureSetup(){
-  mesh_t *mesh_p = (mesh_t*) this;
-  meshTri2D* trimesh = (meshTri2D*) mesh_p;
-  trimesh->meshTri2D::CubatureSetup();
-}
+namespace libp {
 
-void meshTri2D::CubatureSetup(){
+void mesh_t::CubatureSetupTri2D(){
 
   /* Cubature data */
   cubN = 2*N; //cubature order
-  CubatureNodesTri2D(cubN, &cubNp, &cubr, &cubs, &cubw);
+  CubatureNodesTri2D(cubN, cubNp, cubr, cubs, cubw);
 
-  cubInterp = (dfloat *) malloc(Np*cubNp*sizeof(dfloat));
-  InterpolationMatrixTri2D(N, Np, r, s, cubNp, cubr, cubs, cubInterp);
+  InterpolationMatrixTri2D(N, r, s, cubr, cubs, cubInterp);
 
   //cubature project cubProject = M^{-1} * cubInterp^T
   // Defined such that cubProject * cubW * cubInterp = Identity
-  cubProject = (dfloat*) calloc(cubNp*Np, sizeof(dfloat));
-  CubaturePmatrixTri2D(N, Np, r, s, cubNp, cubr, cubs, cubProject);
+  CubaturePmatrixTri2D(N, r, s, cubr, cubs, cubProject);
 
   //cubature derivates matrices, cubD: differentiate on cubature nodes
   // we dont use cubD on Tris/Tets  so skip computing
 
   // Instead, it's cheaper to:
   // make weak cubature derivatives cubPDT = cubProject * cubD^T
-  cubPDT  = (dfloat*) calloc(2*cubNp*Np, sizeof(dfloat));
+  CubatureWeakDmatricesTri2D(N, r, s,
+                             cubr, cubs,
+                             cubPDT);
   cubPDrT = cubPDT + 0*cubNp*Np;
   cubPDsT = cubPDT + 1*cubNp*Np;
-  CubatureWeakDmatricesTri2D(N, Np, r, s, cubNp, cubr, cubs, cubPDrT, cubPDsT);
 
   // cubN+1 point Gauss-Legendre quadrature for surface integrals
   cubNq  = cubN+1;
   cubNfp = cubN+1;
   intNfp = cubN+1;
-  intr = (dfloat *) malloc(cubNfp*sizeof(dfloat));
-  intw = (dfloat *) malloc(cubNfp*sizeof(dfloat));
   JacobiGQ(0, 0, cubN, intr, intw);
 
-  intInterp = (dfloat*) calloc(intNfp*Nfaces*Nfp, sizeof(dfloat));
-  intLIFT = (dfloat*) calloc(Nfaces*intNfp*Np, sizeof(dfloat));
-  CubatureSurfaceMatricesTri2D(N, Np, r, s, faceNodes, intNfp, intr, intw,
+  CubatureSurfaceMatricesTri2D(N, r, s, faceNodes,
+                               intr, intw,
                                intInterp, intLIFT);
 
   // add compile time constants to kernels
@@ -79,10 +69,10 @@ void meshTri2D::CubatureSetup(){
   props["defines/" "p_cubNfp"]= cubNfp;
 
   // build transposes (we hold matrices as column major on device)
-  dfloat *cubProjectT = (dfloat*) calloc(cubNp*Np, sizeof(dfloat));
-  dfloat *cubInterpT   = (dfloat*) calloc(cubNp*Np, sizeof(dfloat));
-  matrixTranspose(cubNp, Np, cubInterp, Np, cubInterpT, cubNp);
-  matrixTranspose(Np, cubNp, cubProject, cubNp, cubProjectT, Np);
+  memory<dfloat> cubProjectT(cubNp*Np);
+  memory<dfloat> cubInterpT(cubNp*Np);
+  linAlg_t::matrixTranspose(cubNp, Np, cubInterp, Np, cubInterpT, cubNp);
+  linAlg_t::matrixTranspose(Np, cubNp, cubProject, cubNp, cubProjectT, Np);
 
   //pre-multiply cubProject by W on device
   for(int n=0;n<cubNp;++n){
@@ -91,11 +81,11 @@ void meshTri2D::CubatureSetup(){
     }
   }
 
-  dfloat *cubPDTT = (dfloat*) calloc(2*cubNp*Np, sizeof(dfloat));
-  dfloat *cubPDrTT = cubPDTT + 0*cubNp*Np;
-  dfloat *cubPDsTT = cubPDTT + 1*cubNp*Np;
-  matrixTranspose(Np, cubNp, cubPDrT, cubNp, cubPDrTT, Np);
-  matrixTranspose(Np, cubNp, cubPDsT, cubNp, cubPDsTT, Np);
+  memory<dfloat> cubPDTT(2*cubNp*Np);
+  memory<dfloat> cubPDrTT = cubPDTT + 0*cubNp*Np;
+  memory<dfloat> cubPDsTT = cubPDTT + 1*cubNp*Np;
+  linAlg_t::matrixTranspose(Np, cubNp, cubPDrT, cubNp, cubPDrTT, Np);
+  linAlg_t::matrixTranspose(Np, cubNp, cubPDsT, cubNp, cubPDsTT, Np);
 
   //pre-multiply cubPDT by W on device
   for(int n=0;n<cubNp;++n){
@@ -106,26 +96,23 @@ void meshTri2D::CubatureSetup(){
   }
 
   // build surface integration matrix transposes
-  dfloat *intLIFTT = (dfloat*) calloc(Np*Nfaces*intNfp, sizeof(dfloat));
-  dfloat *intInterpT = (dfloat*) calloc(Nfp*Nfaces*intNfp, sizeof(dfloat));
-  matrixTranspose(Np, Nfaces*intNfp, intLIFT, Nfaces*intNfp, intLIFTT, Np);
-  matrixTranspose(Nfaces*intNfp, Nfp, intInterp, Nfp, intInterpT, Nfaces*intNfp);
-
-  o_cubvgeo = o_vgeo;// dummy
-  o_cubsgeo = o_sgeo; //dummy cubature geo factors
+  memory<dfloat> intLIFTT(Np*Nfaces*intNfp);
+  memory<dfloat> intInterpT(Nfp*Nfaces*intNfp);
+  linAlg_t::matrixTranspose(Np, Nfaces*intNfp, intLIFT, Nfaces*intNfp, intLIFTT, Np);
+  linAlg_t::matrixTranspose(Nfaces*intNfp, Nfp, intInterp, Nfp, intInterpT, Nfaces*intNfp);
 
-  o_cubInterp   = platform.malloc(Np*cubNp*sizeof(dfloat), cubInterpT);
-  o_cubProject = platform.malloc(Np*cubNp*sizeof(dfloat), cubProjectT);
+  o_cubInterp  = platform.malloc<dfloat>(Np*cubNp, cubInterpT);
+  o_cubProject = platform.malloc<dfloat>(Np*cubNp, cubProjectT);
 
-  o_cubPDT = platform.malloc(2*cubNp*Np*sizeof(dfloat), cubPDTT);
-  o_cubD = o_cubPDT; //dummy
+  o_cubPDT = platform.malloc<dfloat>(2*cubNp*Np, cubPDTT);
 
-  o_intInterp = platform.malloc(Nfp*Nfaces*intNfp*sizeof(dfloat), intInterpT);
-  o_intLIFT = platform.malloc(Np*Nfaces*intNfp*sizeof(dfloat), intLIFTT);
+  o_intInterp = platform.malloc<dfloat>(Nfp*Nfaces*intNfp, intInterpT);
+  o_intLIFT   = platform.malloc<dfloat>(Np*Nfaces*intNfp, intLIFTT);
 
-  free(cubPDTT);
-  free(cubProjectT);
-  free(cubInterpT);
-  free(intLIFTT);
-  free(intInterpT);
+  o_cubwJ = o_wJ;
+  o_cubvgeo = o_vgeo;
+  o_cubggeo = o_ggeo;
+  o_cubsgeo = o_sgeo;
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshParallelGatherScatterSetup.cpp b/libs/mesh/meshGatherScatterSetup.cpp
similarity index 53%
rename from libs/mesh/meshParallelGatherScatterSetup.cpp
rename to libs/mesh/meshGatherScatterSetup.cpp
index 88e034803..dcb978041 100644
--- a/libs/mesh/meshParallelGatherScatterSetup.cpp
+++ b/libs/mesh/meshGatherScatterSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,33 +24,72 @@ SOFTWARE.
 
 */
 
+
 #include "mesh.hpp"
 
-void mesh_t::ParallelGatherScatterSetup() {
+namespace libp {
+
+void mesh_t::GatherScatterSetup() {
 
-  dlong Ntotal = Np*Nelements;
+  dlong Ntotal = Nverts*(Nelements+totalHaloPairs);
 
-  int verbose = 0;
-  ogs = ogs_t::Setup(Ntotal, globalIds, comm, verbose, platform);
+  memory<int> minRank(Ntotal);
+  memory<int> maxRank(Ntotal);
 
-  //use the gs to find what nodes are local to this rank
-  int *minRank = (int *) calloc(Ntotal,sizeof(int));
-  int *maxRank = (int *) calloc(Ntotal,sizeof(int));
   for (dlong i=0;i<Ntotal;i++) {
     minRank[i] = rank;
     maxRank[i] = rank;
   }
 
-  ogs->GatherScatter(minRank, ogs_int, ogs_min, ogs_sym); //minRank[n] contains the smallest rank taking part in the gather of node n
-  ogs->GatherScatter(maxRank, ogs_int, ogs_max, ogs_sym); //maxRank[n] contains the largest rank taking part in the gather of node n
+  hlong gatherChange = 1;
+
+    // keep comparing min/max ranks on positive and negative traces until convergence
+  while(gatherChange>0){
+
+    // reset change counter
+    gatherChange = 0;
+
+    // send halo data and receive it into the halo extension of each buffer
+    halo.Exchange(minRank, Nverts);
+    halo.Exchange(maxRank, Nverts);
+
+    // compare trace vertices
+    #pragma omp parallel for collapse(2)
+    for(dlong e=0;e<Nelements;++e){
+      for(int n=0;n<Nfaces*NfaceVertices;++n){
+        dlong id  = e*Nfaces*NfaceVertices + n;
+        dlong idM = VmapM[id];
+        dlong idP = VmapP[id];
+
+        int minRankM = minRank[idM];
+        int minRankP = minRank[idP];
+
+        int maxRankM = maxRank[idM];
+        int maxRankP = maxRank[idP];
+
+        if(minRankP<minRankM){
+          gatherChange=1;
+          minRank[idM] = minRankP;
+        }
+
+        if(maxRankP>maxRankM){
+          gatherChange=1;
+          maxRank[idM] = maxRankP;
+        }
+      }
+    }
+
+    // sum up changes
+    comm.Allreduce(gatherChange);
+  }
 
   // count elements that contribute to global C0 gather-scatter
   dlong globalCount = 0;
   dlong localCount = 0;
   for(dlong e=0;e<Nelements;++e){
     int isHalo = 0;
-    for(int n=0;n<Np;++n){
-      dlong id = e*Np+n;
+    for(int n=0;n<Nverts;++n){
+      dlong id = e*Nverts+n;
       if ((minRank[id]!=rank)||(maxRank[id]!=rank)) {
         isHalo = 1;
         break;
@@ -60,16 +99,16 @@ void mesh_t::ParallelGatherScatterSetup() {
     localCount += 1-isHalo;
   }
 
-  globalGatherElementList = (dlong*) calloc(globalCount, sizeof(dlong));
-  localGatherElementList  = (dlong*) calloc(localCount, sizeof(dlong));
+  globalGatherElementList.malloc(globalCount);
+  localGatherElementList.malloc(localCount);
 
   globalCount = 0;
   localCount = 0;
 
   for(dlong e=0;e<Nelements;++e){
     int isHalo = 0;
-    for(int n=0;n<Np;++n){
-      dlong id = e*Np+n;
+    for(int n=0;n<Nverts;++n){
+      dlong id = e*Nverts+n;
       if ((minRank[id]!=rank)||(maxRank[id]!=rank)) {
         isHalo = 1;
         break;
@@ -81,8 +120,13 @@ void mesh_t::ParallelGatherScatterSetup() {
       localGatherElementList[localCount++] = e;
     }
   }
-  //printf("local = %d, global = %d\n", localCount, globalCount);
 
   NglobalGatherElements = globalCount;
   NlocalGatherElements = localCount;
+
+  // send to device
+  o_globalGatherElementList = platform.malloc<dlong>(globalGatherElementList);
+  o_localGatherElementList = platform.malloc<dlong>(localGatherElementList);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshGeometricFactorsHex3D.cpp b/libs/mesh/meshGeometricFactorsHex3D.cpp
index a45f5aa8c..1aae6b50e 100644
--- a/libs/mesh/meshGeometricFactorsHex3D.cpp
+++ b/libs/mesh/meshGeometricFactorsHex3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,23 +25,73 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshHex3D::GeometricFactors(){
+namespace libp {
 
-  /* unified storage array for geometric factors */
+void mesh_t::GeometricFactorsHex3D(){
+
+  /* Set the offset of each geometric factor within vgeo */
   Nvgeo = 12;
 
+  RXID  = 0;
+  RYID  = 1;
+  RZID  = 2;
+  SXID  = 3;
+  SYID  = 4;
+  SZID  = 5;
+  TXID  = 6;
+  TYID  = 7;
+  TZID  = 8;
+  JID   = 9;
+  JWID  = 10;
+  IJWID = 11;
+
+  props["defines/" "p_Nvgeo"]= Nvgeo;
+  props["defines/" "p_RXID"]= RXID;
+  props["defines/" "p_SXID"]= SXID;
+  props["defines/" "p_TXID"]= TXID;
+
+  props["defines/" "p_RYID"]= RYID;
+  props["defines/" "p_SYID"]= SYID;
+  props["defines/" "p_TYID"]= TYID;
+
+  props["defines/" "p_RZID"]= RZID;
+  props["defines/" "p_SZID"]= SZID;
+  props["defines/" "p_TZID"]= TZID;
+
+  props["defines/" "p_JID"]= JID;
+  props["defines/" "p_JWID"]= JWID;
+  props["defines/" "p_IJWID"]= IJWID;
+
+  /* unified storage array for geometric factors */
   /* note that we have volume geometric factors for each node */
-  vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo*Np, sizeof(dfloat));
+  vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo*Np);
+
+  Nggeo = 6;
+
+  G00ID=0;
+  G01ID=1;
+  G02ID=2;
+  G11ID=3;
+  G12ID=4;
+  G22ID=5;
+
+  props["defines/" "p_Nggeo"]= Nggeo;
+  props["defines/" "p_G00ID"]= G00ID;
+  props["defines/" "p_G01ID"]= G01ID;
+  props["defines/" "p_G02ID"]= G02ID;
+  props["defines/" "p_G11ID"]= G11ID;
+  props["defines/" "p_G12ID"]= G12ID;
+  props["defines/" "p_G22ID"]= G22ID;
 
   /* number of second order geometric factors */
-  Nggeo = 7;
+  ggeo.malloc(Nelements*Nggeo*Np);
 
-  ggeo = (dfloat*) calloc(Nelements*Nggeo*Np, sizeof(dfloat));
+  wJ.malloc(Nelements*Np);
 
   // dfloat minJ = 1e9, maxJ = -1e9, maxSkew = 0;
 
+  #pragma omp parallel for
   for(dlong e=0;e<Nelements;++e){ /* for each element */
 
     for(int k=0;k<Nq;++k){
@@ -83,17 +133,14 @@ void meshHex3D::GeometricFactors(){
           // maxSkew = mymax(maxSkew, ht/hr);
           // maxSkew = mymax(maxSkew, ht/hs);
 
-          if(J<1e-12) {
-            stringstream ss;
-            ss << "Negative J found at element " << e << "\n";
-            LIBP_ABORT(ss.str())
-          }
+          LIBP_ABORT("Negative J found at element " << e,
+                     J<1e-12);
 
           dfloat rx =  (ys*zt - zs*yt)/J, ry = -(xs*zt - zs*xt)/J, rz =  (xs*yt - ys*xt)/J;
           dfloat sx = -(yr*zt - zr*yt)/J, sy =  (xr*zt - zr*xt)/J, sz = -(xr*yt - yr*xt)/J;
           dfloat tx =  (yr*zs - zr*ys)/J, ty = -(xr*zs - zr*xs)/J, tz =  (xr*ys - yr*xs)/J;
 
-          dfloat JW = J*w[i]*w[j]*w[k];
+          dfloat JW = J*gllw[i]*gllw[j]*gllw[k];
 
           /* store geometric factors */
           vgeo[Nvgeo*Np*e + n + Np*RXID] = rx;
@@ -119,12 +166,20 @@ void meshHex3D::GeometricFactors(){
           ggeo[Nggeo*Np*e + n + Np*G11ID] = JW*(sx*sx + sy*sy + sz*sz);
           ggeo[Nggeo*Np*e + n + Np*G12ID] = JW*(sx*tx + sy*ty + sz*tz);
           ggeo[Nggeo*Np*e + n + Np*G22ID] = JW*(tx*tx + ty*ty + tz*tz);
-          ggeo[Nggeo*Np*e + n + Np*GWJID] = JW;
+
+          wJ[Np*e + n] = JW;
         }
       }
     }
   }
 
+  halo.Exchange(vgeo, Nvgeo*Np);
+
+  o_wJ   = platform.malloc<dfloat>(wJ);
+  o_vgeo = platform.malloc<dfloat>(vgeo);
+  o_ggeo = platform.malloc<dfloat>(ggeo);
+
+
   #if 0
     dfloat globalMinJ, globalMaxJ, globalMaxSkew;
 
@@ -135,6 +190,6 @@ void meshHex3D::GeometricFactors(){
     if(rank==0)
       printf("J in range [%g,%g] and max Skew = %g\n", globalMinJ, globalMaxJ, globalMaxSkew);
   #endif
-
-  halo->Exchange(vgeo, Nvgeo*Np, ogs_dfloat);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshGeometricFactorsQuad2D.cpp b/libs/mesh/meshGeometricFactorsQuad2D.cpp
index 343de3525..32d04377a 100644
--- a/libs/mesh/meshGeometricFactorsQuad2D.cpp
+++ b/libs/mesh/meshGeometricFactorsQuad2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,20 +25,52 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
 
-void meshQuad2D::GeometricFactors(){
+namespace libp {
 
-  /* unified storage array for geometric factors */
+void mesh_t::GeometricFactorsQuad2D(){
+
+  /* Set the offset of each geometric factor within vgeo */
   Nvgeo = 7;
 
+  RXID  = 0;
+  RYID  = 1;
+  SXID  = 2;
+  SYID  = 3;
+  JID   = 4;
+  JWID  = 5;
+  IJWID = 6;
+
+  props["defines/" "p_Nvgeo"]= Nvgeo;
+  props["defines/" "p_RXID"]= RXID;
+  props["defines/" "p_SXID"]= SXID;
+  props["defines/" "p_RYID"]= RYID;
+  props["defines/" "p_SYID"]= SYID;
+  props["defines/" "p_JID"]= JID;
+  props["defines/" "p_JWID"]= JWID;
+  props["defines/" "p_IJWID"]= IJWID;
+
+  /* unified storage array for geometric factors */
   /* note that we have volume geometric factors for each node */
-  vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo*Np, sizeof(dfloat));
+  vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo*Np);
+
+  Nggeo = 3;
+
+  G00ID=0;
+  G01ID=1;
+  G11ID=2;
+
+  props["defines/" "p_Nggeo"]= Nggeo;
+  props["defines/" "p_G00ID"]= G00ID;
+  props["defines/" "p_G01ID"]= G01ID;
+  props["defines/" "p_G11ID"]= G11ID;
 
   /* number of second order geometric factors */
-  Nggeo = 4;
-  ggeo = (dfloat*) calloc(Nelements*Nggeo*Np, sizeof(dfloat));
+  ggeo.malloc(Nelements*Nggeo*Np);
 
+  wJ.malloc(Nelements*Np);
+
+  #pragma omp parallel for
   for(dlong e=0;e<Nelements;++e){ /* for each element */
     for(int j=0;j<Nq;++j){
       for(int i=0;i<Nq;++i){
@@ -63,16 +95,14 @@ void meshQuad2D::GeometricFactors(){
         /* compute geometric factors for affine coordinate transform*/
         dfloat J = xr*ys - xs*yr;
 
-        if(J<1e-8) {
-          stringstream ss;
-          ss << "Negative J found at element " << e << "\n";
-          LIBP_ABORT(ss.str())
-        }
+        LIBP_ABORT("Negative J found at element " << e,
+                   J<1e-8);
+
         dfloat rx =  ys/J;
         dfloat ry = -xs/J;
         dfloat sx = -yr/J;
         dfloat sy =  xr/J;
-        dfloat JW = J*w[i]*w[j];
+        dfloat JW = J*gllw[i]*gllw[j];
 
         /* store geometric factors */
         vgeo[Nvgeo*Np*e + n + Np*RXID] = rx;
@@ -87,10 +117,17 @@ void meshQuad2D::GeometricFactors(){
         ggeo[Nggeo*Np*e + n + Np*G00ID] = JW*(rx*rx + ry*ry);
         ggeo[Nggeo*Np*e + n + Np*G01ID] = JW*(rx*sx + ry*sy);
         ggeo[Nggeo*Np*e + n + Np*G11ID] = JW*(sx*sx + sy*sy);
-        ggeo[Nggeo*Np*e + n + Np*GWJID] = JW;
+
+        wJ[Np*e + n] = JW;
       }
     }
   }
 
-  halo->Exchange(vgeo, Nvgeo*Np, ogs_dfloat);
+  halo.Exchange(vgeo, Nvgeo*Np);
+
+  o_wJ   = platform.malloc<dfloat>(wJ);
+  o_vgeo = platform.malloc<dfloat>(vgeo);
+  o_ggeo = platform.malloc<dfloat>(ggeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshGeometricFactorsQuad3D.cpp b/libs/mesh/meshGeometricFactorsQuad3D.cpp
index 361a95102..9c20c240e 100644
--- a/libs/mesh/meshGeometricFactorsQuad3D.cpp
+++ b/libs/mesh/meshGeometricFactorsQuad3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,239 +25,284 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 // custom geometric factors specialized for 3D quad on sphere
 
-void meshQuad3D::GeometricFactors(){
+void mesh_t::GeometricFactorsQuad3D(){
+
+  /* Set the offset of each geometric factor within vgeo */
+  Nvgeo = 12;
+
+  RXID  = 0;
+  RYID  = 1;
+  RZID  = 2;
+  SXID  = 3;
+  SYID  = 4;
+  SZID  = 5;
+  TXID  = 6;
+  TYID  = 7;
+  TZID  = 8;
+  JID   = 9;
+  JWID  = 10;
+  IJWID = 11;
+
+  props["defines/" "p_Nvgeo"]= Nvgeo;
+  props["defines/" "p_RXID"]= RXID;
+  props["defines/" "p_SXID"]= SXID;
+  props["defines/" "p_TXID"]= TXID;
+
+  props["defines/" "p_RYID"]= RYID;
+  props["defines/" "p_SYID"]= SYID;
+  props["defines/" "p_TYID"]= TYID;
+
+  props["defines/" "p_RZID"]= RZID;
+  props["defines/" "p_SZID"]= SZID;
+  props["defines/" "p_TZID"]= TZID;
+
+  props["defines/" "p_JID"]= JID;
+  props["defines/" "p_JWID"]= JWID;
+  props["defines/" "p_IJWID"]= IJWID;
 
   /* unified storage array for geometric factors */
-  Nvgeo = 12; //
-
   /* note that we have volume geometric factors for each node */
-  vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo*Np, sizeof(dfloat));
-
-  cubvgeo = (dfloat*) calloc(Nelements*Nvgeo*cubNp, sizeof(dfloat));
-
-  // Can be computed on the fly
-  Nggeo = 7;
-  ggeo  = (dfloat *) calloc(Nelements*Np*Nggeo, sizeof(dfloat));
-
-  dfloat *cxr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
-  dfloat *cxs = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
-  dfloat *cyr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
-  dfloat *cys = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
-  dfloat *czr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
-  dfloat *czs = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
-  dfloat *cx  = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
-  dfloat *cy  = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
-  dfloat *cz  = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
+  vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo*Np);
+
+  Nggeo = 6;
+
+  G00ID=0;
+  G01ID=1;
+  G02ID=2;
+  G11ID=3;
+  G12ID=4;
+  G22ID=5;
+
+  props["defines/" "p_Nggeo"]= Nggeo;
+  props["defines/" "p_G00ID"]= G00ID;
+  props["defines/" "p_G01ID"]= G01ID;
+  props["defines/" "p_G02ID"]= G02ID;
+  props["defines/" "p_G11ID"]= G11ID;
+  props["defines/" "p_G12ID"]= G12ID;
+  props["defines/" "p_G22ID"]= G22ID;
+
+  /* number of second order geometric factors */
+  ggeo.malloc(Nelements*Nggeo*Np);
+
+  wJ.malloc(Nelements*Np);
+
+  // dfloat *cxr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
+  // dfloat *cxs = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
+  // dfloat *cyr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
+  // dfloat *cys = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
+  // dfloat *czr = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
+  // dfloat *czs = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
+  // dfloat *cx  = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
+  // dfloat *cy  = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
+  // dfloat *cz  = (dfloat*) calloc(cubNq*cubNq, sizeof(dfloat));
 
   for(int e=0;e<Nelements;++e){ /* for each element */
 
-    for(int n=0;n<cubNq*cubNq;++n){
-      cxr[n] = 0; cyr[n] = 0; czr[n] = 0;
-      cxs[n] = 0; cys[n] = 0; czs[n] = 0;
-      cx[n] = 0;  cy[n] = 0;  cz[n] = 0;
-    }
-
-
-  for(int j=0;j<Nq;++j){
-    for(int i=0;i<Nq;++i){
+    // for(int n=0;n<cubNq*cubNq;++n){
+    //   cxr[n] = 0; cyr[n] = 0; czr[n] = 0;
+    //   cxs[n] = 0; cys[n] = 0; czs[n] = 0;
+    //   cx[n] = 0;  cy[n] = 0;  cz[n] = 0;
+    // }
 
-      dfloat xij = x[i+j*Nq+e*Np];
-      dfloat yij = y[i+j*Nq+e*Np];
-      dfloat zij = z[i+j*Nq+e*Np];
 
-      dfloat xr = 0, yr = 0, zr = 0;
-      dfloat xs = 0, ys = 0, zs = 0;
+    for(int j=0;j<Nq;++j){
+      for(int i=0;i<Nq;++i){
 
-      for(int n=0;n<Nq;++n){
+        dfloat xij = x[i+j*Nq+e*Np];
+        dfloat yij = y[i+j*Nq+e*Np];
+        dfloat zij = z[i+j*Nq+e*Np];
 
-	dfloat Din = D[i*Nq+n];
-	dfloat Djn = D[j*Nq+n];
+        dfloat xr = 0, yr = 0, zr = 0;
+        dfloat xs = 0, ys = 0, zs = 0;
 
-	xr += Din*x[n+j*Nq+e*Np];
-	yr += Din*y[n+j*Nq+e*Np];
-	zr += Din*z[n+j*Nq+e*Np];
+        for(int n=0;n<Nq;++n){
 
-	xs += Djn*x[i+n*Nq+e*Np];
-	ys += Djn*y[i+n*Nq+e*Np];
-	zs += Djn*z[i+n*Nq+e*Np];
+          dfloat Din = D[i*Nq+n];
+          dfloat Djn = D[j*Nq+n];
 
+          xr += Din*x[n+j*Nq+e*Np];
+          yr += Din*y[n+j*Nq+e*Np];
+          zr += Din*z[n+j*Nq+e*Np];
+
+          xs += Djn*x[i+n*Nq+e*Np];
+          ys += Djn*y[i+n*Nq+e*Np];
+          zs += Djn*z[i+n*Nq+e*Np];
+        }
+
+        {
+          dfloat rx = ys*zij - zs*yij; // dXds x X
+          dfloat ry = zs*xij - xs*zij;
+          dfloat rz = xs*yij - ys*xij;
+
+          dfloat sx = zr*yij - yr*zij; // -dXdr x X
+          dfloat sy = xr*zij - zr*xij;
+          dfloat sz = yr*xij - xr*yij;
+
+          dfloat tx = yr*zs - zr*ys; // dXdr x dXds ~ X*|dXdr x dXds|/|X|
+          dfloat ty = zr*xs - xr*zs;
+          dfloat tz = xr*ys - yr*xs;
+
+          dfloat Gx = tx, Gy = ty, Gz = tz;
+
+          dfloat J = xij*tx + yij*ty + zij*tz;
+
+          LIBP_ABORT("Negative J found at element " << e << "x=" << xij << " y=" << yij << " z=" << zij,
+                     J<1e-8);
+
+          rx /= J;      sx /= J;      tx /= J;
+          ry /= J;      sy /= J;      ty /= J;
+          rz /= J;      sz /= J;      tz /= J;
+
+          // use this for "volume" Jacobian
+          dfloat Jnew = sqrt(Gx*Gx+Gy*Gy+Gz*Gz);  //(difference between actual Jacobian and sphere Jac)
+          J = Jnew;
+
+          LIBP_ABORT("Negative J found at element " << e << "x=" << xij << " y=" << yij << " z=" << zij,
+                     J<1e-8);
+          //    printf("before: grad r = %g,%g,%g\n", rx, ry, rz);
+        }
+
+        dfloat GG00 = xr*xr+yr*yr+zr*zr;
+        dfloat GG11 = xs*xs+ys*ys+zs*zs;
+        dfloat GG01 = xr*xs+yr*ys+zr*zs;
+        dfloat detGG = GG00*GG11 - GG01*GG01;
+
+        // are these tangential?
+        dfloat rx = (xr*GG11-xs*GG01)/detGG;
+        dfloat ry = (yr*GG11-ys*GG01)/detGG;
+        dfloat rz = (zr*GG11-zs*GG01)/detGG;
+
+        dfloat sx = (-xr*GG01+xs*GG00)/detGG;
+        dfloat sy = (-yr*GG01+ys*GG00)/detGG;
+        dfloat sz = (-zr*GG01+zs*GG00)/detGG;
+
+        dfloat tx = yr*zs - zr*ys; // dXdr x dXds ~ X*|dXdr x dXds|/|X|
+        dfloat ty = zr*xs - xr*zs;
+        dfloat tz = xr*ys - yr*xs;
+
+        // use this for "volume" Jacobian
+        dfloat J = sqrt(tx*tx+ty*ty+tz*tz); // (difference between actual Jacobian and sphere Jac)
+
+        //  printf("after: grad r = %g,%g,%g\n", rx, ry, rz);
+
+        dfloat JW = J*gllw[i]*gllw[j];
+
+        /* store geometric factors */
+        int base = Nvgeo*Np*e + j*Nq + i;
+
+        vgeo[base + Np*RXID] = rx;
+        vgeo[base + Np*RYID] = ry;
+        vgeo[base + Np*RZID] = rz;
+        vgeo[base + Np*SXID] = sx;
+        vgeo[base + Np*SYID] = sy;
+        vgeo[base + Np*SZID] = sz;
+        vgeo[base + Np*TXID] = tx;
+        vgeo[base + Np*TYID] = ty;
+        vgeo[base + Np*TZID] = tz;
+        vgeo[base + Np*JID]  = J;
+        vgeo[base + Np*JWID] = JW;
+        vgeo[base + Np*IJWID] = 1./JW;
+
+        /* store second order geometric factors (can be computed on the fly, later!!!)*/
+        int gbase = Nggeo*Np*e + j*Nq + i;
+        ggeo[gbase + Np*G00ID] = JW*(rx*rx + ry*ry + rz*rz);
+        ggeo[gbase + Np*G01ID] = JW*(rx*sx + ry*sy + rz*sz);
+        ggeo[gbase + Np*G02ID] = JW*(rx*tx + ry*ty + rz*tz);
+        ggeo[gbase + Np*G11ID] = JW*(sx*sx + sy*sy + sz*sz);
+        ggeo[gbase + Np*G12ID] = JW*(sx*tx + sy*ty + sz*tz);
+        ggeo[gbase + Np*G22ID] = JW*(tx*tx + ty*ty + tz*tz);
+
+        wJ[Np*e + j*Nq + i] = JW;
+
+        // now do for cubvgeo
+        // 1. interpolate Jacobian matrix to cubature nodes
+        // for(int m=0;m<cubNq;++m){
+        //   for(int n=0;n<cubNq;++n){
+        //     dfloat cIni = cubInterp[n*Nq+i];
+        //     dfloat cImj = cubInterp[m*Nq+j];
+        //     cxr[n+m*cubNq] += cIni*cImj*xr;
+        //     cxs[n+m*cubNq] += cIni*cImj*xs;
+        //     cyr[n+m*cubNq] += cIni*cImj*yr;
+        //     cys[n+m*cubNq] += cIni*cImj*ys;
+        //     czr[n+m*cubNq] += cIni*cImj*zr;
+        //     czs[n+m*cubNq] += cIni*cImj*zs;
+        //     cx[n+m*cubNq] += cIni*cImj*xij;
+        //     cy[n+m*cubNq] += cIni*cImj*yij;
+        //     cz[n+m*cubNq] += cIni*cImj*zij;
+        //   }
+        // }
       }
-
-      {
-	dfloat rx = ys*zij - zs*yij; // dXds x X
-	dfloat ry = zs*xij - xs*zij;
-	dfloat rz = xs*yij - ys*xij;
-
-	dfloat sx = zr*yij - yr*zij; // -dXdr x X
-	dfloat sy = xr*zij - zr*xij;
-	dfloat sz = yr*xij - xr*yij;
-
-	dfloat tx = yr*zs - zr*ys; // dXdr x dXds ~ X*|dXdr x dXds|/|X|
-	dfloat ty = zr*xs - xr*zs;
-	dfloat tz = xr*ys - yr*xs;
-
-	dfloat Gx = tx, Gy = ty, Gz = tz;
-
-	dfloat J = xij*tx + yij*ty + zij*tz;
-
-	if(J<1e-8) {
-	  stringstream ss;
-	  ss << "Negative J found at element " << e << "x=" << xij << " y=" << yij << " z=" << zij << "\n";
-	  LIBP_ABORT(ss.str())
-	    }
-
-	rx /= J;      sx /= J;      tx /= J;
-	ry /= J;      sy /= J;      ty /= J;
-	rz /= J;      sz /= J;      tz /= J;
-
-	// use this for "volume" Jacobian
-	dfloat Jnew = sqrt(Gx*Gx+Gy*Gy+Gz*Gz);  //(difference between actual Jacobian and sphere Jac)
-	J = Jnew;
-
-	if(J<1e-8) {
-	  stringstream ss;
-	  ss << "Negative J found at element " << e << "x=" << xij << " y=" << yij << " z=" << zij << "\n";
-	  ss << "Negative J found at element " << e << "\n";
-	  LIBP_ABORT(ss.str())
-	    }
-	//    printf("before: grad r = %g,%g,%g\n", rx, ry, rz);
-      }
-
-      dfloat GG00 = xr*xr+yr*yr+zr*zr;
-  dfloat GG11 = xs*xs+ys*ys+zs*zs;
-  dfloat GG01 = xr*xs+yr*ys+zr*zs;
-  dfloat detGG = GG00*GG11 - GG01*GG01;
-
-  // are these tangential
-  dfloat rx = (xr*GG11-xs*GG01)/detGG;
-  dfloat ry = (yr*GG11-ys*GG01)/detGG;
-  dfloat rz = (zr*GG11-zs*GG01)/detGG;
-
-  dfloat sx = (-xr*GG01+xs*GG00)/detGG;
-  dfloat sy = (-yr*GG01+ys*GG00)/detGG;
-  dfloat sz = (-zr*GG01+zs*GG00)/detGG;
-
-  dfloat tx = yr*zs - zr*ys; // dXdr x dXds ~ X*|dXdr x dXds|/|X|
-  dfloat ty = zr*xs - xr*zs;
-  dfloat tz = xr*ys - yr*xs;
-
-  // use this for "volume" Jacobian
-  dfloat J = sqrt(tx*tx+ty*ty+tz*tz); // (difference between actual Jacobian and sphere Jac)
-
-  //  printf("after: grad r = %g,%g,%g\n", rx, ry, rz);
-
-  dfloat JW = J*w[i]*w[j];
-
-  /* store geometric factors */
-  int base = Nvgeo*Np*e + j*Nq + i;
-
-  vgeo[base + Np*RXID] = rx;
-  vgeo[base + Np*RYID] = ry;
-  vgeo[base + Np*RZID] = rz;
-  vgeo[base + Np*SXID] = sx;
-  vgeo[base + Np*SYID] = sy;
-  vgeo[base + Np*SZID] = sz;
-  vgeo[base + Np*TXID] = tx;
-  vgeo[base + Np*TYID] = ty;
-  vgeo[base + Np*TZID] = tz;
-  vgeo[base + Np*JID]  = J;
-  vgeo[base + Np*JWID] = JW;
-  vgeo[base + Np*IJWID] = 1./JW;
-
-  /* store second order geometric factors (can be computed on the fly, later!!!)*/
-  int gbase = Nggeo*Np*e + j*Nq + i;
-  ggeo[gbase + Np*G00ID] = JW*(rx*rx + ry*ry + rz*rz);
-  ggeo[gbase + Np*G01ID] = JW*(rx*sx + ry*sy + rz*sz);
-  ggeo[gbase + Np*G02ID] = JW*(rx*tx + ry*ty + rz*tz);
-
-  ggeo[gbase + Np*G11ID] = JW*(sx*sx + sy*sy + sz*sz);
-  ggeo[gbase + Np*G12ID] = JW*(sx*tx + sy*ty + sz*tz);
-
-  ggeo[gbase + Np*G22ID] = JW*(tx*tx + ty*ty + tz*tz);
-  ggeo[gbase + Np*GWJID] = JW;
-
-  // now do for cubvgeo
-  // 1. interpolate Jacobian matrix to cubature nodes
-  for(int m=0;m<cubNq;++m){
-    for(int n=0;n<cubNq;++n){
-      dfloat cIni = cubInterp[n*Nq+i];
-      dfloat cImj = cubInterp[m*Nq+j];
-      cxr[n+m*cubNq] += cIni*cImj*xr;
-      cxs[n+m*cubNq] += cIni*cImj*xs;
-      cyr[n+m*cubNq] += cIni*cImj*yr;
-      cys[n+m*cubNq] += cIni*cImj*ys;
-      czr[n+m*cubNq] += cIni*cImj*zr;
-      czs[n+m*cubNq] += cIni*cImj*zs;
-      cx[n+m*cubNq] += cIni*cImj*xij;
-      cy[n+m*cubNq] += cIni*cImj*yij;
-      cz[n+m*cubNq] += cIni*cImj*zij;
     }
-  }
-      }
-    }
-
-    for(int n=0;n<cubNq*cubNq;++n){
-
-      dfloat rx = cys[n]*cz[n] - czs[n]*cy[n]; // dXds x X
-      dfloat ry = czs[n]*cx[n] - cxs[n]*cz[n];
-      dfloat rz = cxs[n]*cy[n] - cys[n]*cx[n];
-
-      dfloat sx = czr[n]*cy[n] - cyr[n]*cz[n]; // -dXdr x X
-      dfloat sy = cxr[n]*cz[n] - czr[n]*cx[n];
-      dfloat sz = cyr[n]*cx[n] - cxr[n]*cy[n];
-
-      dfloat tx = cyr[n]*czs[n] - czr[n]*cys[n]; // dXdr x dXds ~ X*|dXdr x dXds|/|X|
-      dfloat ty = czr[n]*cxs[n] - cxr[n]*czs[n];
-      dfloat tz = cxr[n]*cys[n] - cyr[n]*cxs[n];
-
-      dfloat Gx = tx, Gy = ty, Gz = tz;
-
-      dfloat J = cx[n]*tx + cy[n]*ty + cz[n]*tz;
 
-      if(J<1e-8) {
-        stringstream ss;
-        ss << "Negative J found at element " << e << "\n";
-	//	ss << "Negative J found at element " << e << "x=" << xij << " y=" << yij << " z=" << zij << "\n";
-        LIBP_ABORT(ss.str())
-      }
-
-      rx /= J;      sx /= J;      tx /= J;
-      ry /= J;      sy /= J;      ty /= J;
-      rz /= J;      sz /= J;      tz /= J;
-
-      // use this for "volume" Jacobian
-      J = sqrt(Gx*Gx+Gy*Gy+Gz*Gz);
-
-      if(J<1e-8) {
-        stringstream ss;
-        ss << "Negative J found at element " << e << "\n";
-	//	ss << "Negative J found at element " << e << "x=" << xij << " y=" << yij << " z=" << zij << "\n";
-        LIBP_ABORT(ss.str())
-      }
-
-      dfloat JW = J*cubw[n%cubNq]*cubw[n/cubNq];
-
-      /* store geometric factors */
-      int base = Nvgeo*cubNp*e + n;
-
-      cubvgeo[base + cubNp*RXID] = rx;
-      cubvgeo[base + cubNp*RYID] = ry;
-      cubvgeo[base + cubNp*RZID] = rz;
-      cubvgeo[base + cubNp*SXID] = sx;
-      cubvgeo[base + cubNp*SYID] = sy;
-      cubvgeo[base + cubNp*SZID] = sz;
-      cubvgeo[base + cubNp*TXID] = tx;
-      cubvgeo[base + cubNp*TYID] = ty;
-      cubvgeo[base + cubNp*TZID] = tz;
-      cubvgeo[base + cubNp*JID]  = J;
-      cubvgeo[base + cubNp*JWID] = JW;
-      cubvgeo[base + cubNp*IJWID] = 1./JW;
-    }
+    // for(int n=0;n<cubNq*cubNq;++n){
+
+    //   dfloat rx = cys[n]*cz[n] - czs[n]*cy[n]; // dXds x X
+    //   dfloat ry = czs[n]*cx[n] - cxs[n]*cz[n];
+    //   dfloat rz = cxs[n]*cy[n] - cys[n]*cx[n];
+
+    //   dfloat sx = czr[n]*cy[n] - cyr[n]*cz[n]; // -dXdr x X
+    //   dfloat sy = cxr[n]*cz[n] - czr[n]*cx[n];
+    //   dfloat sz = cyr[n]*cx[n] - cxr[n]*cy[n];
+
+    //   dfloat tx = cyr[n]*czs[n] - czr[n]*cys[n]; // dXdr x dXds ~ X*|dXdr x dXds|/|X|
+    //   dfloat ty = czr[n]*cxs[n] - cxr[n]*czs[n];
+    //   dfloat tz = cxr[n]*cys[n] - cyr[n]*cxs[n];
+
+    //   dfloat Gx = tx, Gy = ty, Gz = tz;
+
+    //   dfloat J = cx[n]*tx + cy[n]*ty + cz[n]*tz;
+
+    //   if(J<1e-8) {
+    //     stringstream ss;
+    //     ss << "Negative J found at element " << e << "\n";
+    //     //      ss << "Negative J found at element " << e << "x=" << xij << " y=" << yij << " z=" << zij << "\n";
+    //     LIBP_ABORT(ss.str())
+    //   }
+
+    //   rx /= J;      sx /= J;      tx /= J;
+    //   ry /= J;      sy /= J;      ty /= J;
+    //   rz /= J;      sz /= J;      tz /= J;
+
+    //   // use this for "volume" Jacobian
+    //   J = sqrt(Gx*Gx+Gy*Gy+Gz*Gz);
+
+    //   if(J<1e-8) {
+    //     stringstream ss;
+    //     ss << "Negative J found at element " << e << "\n";
+    //     //      ss << "Negative J found at element " << e << "x=" << xij << " y=" << yij << " z=" << zij << "\n";
+    //     LIBP_ABORT(ss.str())
+    //   }
+
+    //   dfloat JW = J*cubw[n%cubNq]*cubw[n/cubNq];
+
+    //   /* store geometric factors */
+    //   int base = Nvgeo*cubNp*e + n;
+
+    //   cubvgeo[base + cubNp*RXID] = rx;
+    //   cubvgeo[base + cubNp*RYID] = ry;
+    //   cubvgeo[base + cubNp*RZID] = rz;
+    //   cubvgeo[base + cubNp*SXID] = sx;
+    //   cubvgeo[base + cubNp*SYID] = sy;
+    //   cubvgeo[base + cubNp*SZID] = sz;
+    //   cubvgeo[base + cubNp*TXID] = tx;
+    //   cubvgeo[base + cubNp*TYID] = ty;
+    //   cubvgeo[base + cubNp*TZID] = tz;
+    //   cubvgeo[base + cubNp*JID]  = J;
+    //   cubvgeo[base + cubNp*JWID] = JW;
+    //   cubvgeo[base + cubNp*IJWID] = 1./JW;
+    // }
   }
 
-  halo->Exchange(vgeo, Nvgeo*Np, ogs_dfloat);
+  halo.Exchange(vgeo, Nvgeo*Np);
+
+  o_wJ   = platform.malloc<dfloat>(wJ);
+  o_vgeo = platform.malloc<dfloat>(vgeo);
+  o_ggeo = platform.malloc<dfloat>(ggeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshGeometricFactorsTet3D.cpp b/libs/mesh/meshGeometricFactorsTet3D.cpp
index 7afed3694..5ab3dfe06 100644
--- a/libs/mesh/meshGeometricFactorsTet3D.cpp
+++ b/libs/mesh/meshGeometricFactorsTet3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,20 +25,69 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTet3D::GeometricFactors(){
+namespace libp {
+
+void mesh_t::GeometricFactorsTet3D(){
+
+  /*Set offsets*/
+  Nvgeo = 10;
+
+  RXID  = 0;
+  RYID  = 1;
+  RZID  = 2;
+  SXID  = 3;
+  SYID  = 4;
+  SZID  = 5;
+  TXID  = 6;
+  TYID  = 7;
+  TZID  = 8;
+  JID   = 9;
+
+  props["defines/" "p_Nvgeo"]= Nvgeo;
+  props["defines/" "p_RXID"]= RXID;
+  props["defines/" "p_SXID"]= SXID;
+  props["defines/" "p_TXID"]= TXID;
+
+  props["defines/" "p_RYID"]= RYID;
+  props["defines/" "p_SYID"]= SYID;
+  props["defines/" "p_TYID"]= TYID;
+
+  props["defines/" "p_RZID"]= RZID;
+  props["defines/" "p_SZID"]= SZID;
+  props["defines/" "p_TZID"]= TZID;
+
+  props["defines/" "p_JID"]= JID;
 
   /* unified storage array for geometric factors */
-  Nvgeo = 12;
-  vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo, sizeof(dfloat));
+  vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo);
+
+  Nggeo = 6;
+
+  G00ID=0;
+  G01ID=1;
+  G02ID=2;
+  G11ID=3;
+  G12ID=4;
+  G22ID=5;
+
+  props["defines/" "p_Nggeo"]= Nggeo;
+  props["defines/" "p_G00ID"]= G00ID;
+  props["defines/" "p_G01ID"]= G01ID;
+  props["defines/" "p_G02ID"]= G02ID;
+  props["defines/" "p_G11ID"]= G11ID;
+  props["defines/" "p_G12ID"]= G12ID;
+  props["defines/" "p_G22ID"]= G22ID;
 
   /* number of second order geometric factors */
-  Nggeo = 7;
-  ggeo = (dfloat*) calloc(Nelements*Nggeo, sizeof(dfloat));
+  ggeo.malloc(Nelements*Nggeo);
+
+  wJ.malloc(Nelements);
+
 
+  // dfloat minJ = 1e9, maxJ = -1e9;
 
-  dfloat minJ = 1e9, maxJ = -1e9;
+  #pragma omp parallel for
   for(dlong e=0;e<Nelements;++e){ /* for each element */
 
     /* find vertex indices and physical coordinates */
@@ -62,13 +111,10 @@ void meshTet3D::GeometricFactors(){
     dfloat sx = -(yr*zt - zr*yt)/J, sy =  (xr*zt - zr*xt)/J, sz = -(xr*yt - yr*xt)/J;
     dfloat tx =  (yr*zs - zr*ys)/J, ty = -(xr*zs - zr*xs)/J, tz =  (xr*ys - yr*xs)/J;
 
-    if(J<0) {
-      stringstream ss;
-      ss << "Negative J found at element " << e << "\n";
-      LIBP_ABORT(ss.str())
-    }
-    minJ = mymin(minJ,J);
-    maxJ = mymax(maxJ,J);
+    LIBP_ABORT("Negative J found at element " << e, J<0);
+
+    // minJ = mymin(minJ,J);
+    // maxJ = mymax(maxJ,J);
 
     /* store geometric factors */
     vgeo[Nvgeo*e + RXID] = rx;
@@ -91,9 +137,16 @@ void meshTet3D::GeometricFactors(){
     ggeo[Nggeo*e + G11ID] = J*(sx*sx + sy*sy + sz*sz);
     ggeo[Nggeo*e + G12ID] = J*(sx*tx + sy*ty + sz*tz);
     ggeo[Nggeo*e + G22ID] = J*(tx*tx + ty*ty + tz*tz);
-    ggeo[Nggeo*e + GWJID] = J;
-  }
 
+    wJ[e] = J;
+  }
   //printf("minJ = %g, maxJ = %g\n", minJ, maxJ);
-  halo->Exchange(vgeo, Nvgeo, ogs_dfloat);
+
+  halo.Exchange(vgeo, Nvgeo);
+
+  o_wJ   = platform.malloc<dfloat>(wJ);
+  o_vgeo = platform.malloc<dfloat>(vgeo);
+  o_ggeo = platform.malloc<dfloat>(ggeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshGeometricFactorsTri2D.cpp b/libs/mesh/meshGeometricFactorsTri2D.cpp
index 7087cf948..9560b5e02 100644
--- a/libs/mesh/meshGeometricFactorsTri2D.cpp
+++ b/libs/mesh/meshGeometricFactorsTri2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,19 +25,47 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
 
-void meshTri2D::GeometricFactors(){
+namespace libp {
 
-  /* unified storage array for geometric factors */
+void mesh_t::GeometricFactorsTri2D(){
+
+  /*Set offsets*/
   Nvgeo = 5;
-  vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo, sizeof(dfloat));
+
+  RXID  = 0;
+  RYID  = 1;
+  SXID  = 2;
+  SYID  = 3;
+  JID   = 4;
+
+  props["defines/" "p_Nvgeo"]= Nvgeo;
+  props["defines/" "p_RXID"]= RXID;
+  props["defines/" "p_SXID"]= SXID;
+  props["defines/" "p_RYID"]= RYID;
+  props["defines/" "p_SYID"]= SYID;
+  props["defines/" "p_JID"]= JID;
+
+  /* unified storage array for geometric factors */
+  vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo);
+
+  Nggeo = 3;
+
+  G00ID=0;
+  G01ID=1;
+  G11ID=2;
+
+  props["defines/" "p_Nggeo"]= Nggeo;
+  props["defines/" "p_G00ID"]= G00ID;
+  props["defines/" "p_G01ID"]= G01ID;
+  props["defines/" "p_G11ID"]= G11ID;
 
   /* number of second order geometric factors */
-  Nggeo = 4;
-  ggeo = (dfloat*) calloc(Nelements*Nggeo, sizeof(dfloat));
+  ggeo.malloc(Nelements*Nggeo);
 
+  wJ.malloc(Nelements);
 
+  #pragma omp parallel for
   for(dlong e=0;e<Nelements;++e){ /* for each element */
 
     /* find vertex indices and physical coordinates */
@@ -54,11 +82,8 @@ void meshTri2D::GeometricFactors(){
     /* compute geometric factors for affine coordinate transform*/
     dfloat J = 0.25*((xe2-xe1)*(ye3-ye1) - (xe3-xe1)*(ye2-ye1));
 
-    if(J<0) {
-      stringstream ss;
-      ss << "Negative J found at element " << e << "\n";
-      LIBP_ABORT(ss.str())
-    }
+    LIBP_ABORT("Negative J found at element " << e, J<0);
+
     dfloat rx =  (0.5/J)*(ye3-ye1);
     dfloat ry = -(0.5/J)*(xe3-xe1);
     dfloat sx = -(0.5/J)*(ye2-ye1);
@@ -75,8 +100,15 @@ void meshTri2D::GeometricFactors(){
     ggeo[Nggeo*e + G00ID] = J*(rx*rx + ry*ry);
     ggeo[Nggeo*e + G01ID] = J*(rx*sx + ry*sy);
     ggeo[Nggeo*e + G11ID] = J*(sx*sx + sy*sy);
-    ggeo[Nggeo*e + GWJID]  = J;
+
+    wJ[e]  = J;
   }
 
-  halo->Exchange(vgeo, Nvgeo, ogs_dfloat);
+  halo.Exchange(vgeo, Nvgeo);
+
+  o_wJ   = platform.malloc<dfloat>(wJ);
+  o_vgeo = platform.malloc<dfloat>(vgeo);
+  o_ggeo = platform.malloc<dfloat>(ggeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshGeometricFactorsTri3D.cpp b/libs/mesh/meshGeometricFactorsTri3D.cpp
index bcac3c639..f9558f829 100644
--- a/libs/mesh/meshGeometricFactorsTri3D.cpp
+++ b/libs/mesh/meshGeometricFactorsTri3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,22 +25,69 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
 // custom geometric factors specialized for 3D tri on sphere
 
-void meshTri3D::GeometricFactors(){
+namespace libp {
 
-  /* unified storage array for geometric factors */
-  Nvgeo = 12; //
+void mesh_t::GeometricFactorsTri3D(){
+
+  /*Set offsets*/
+  Nvgeo = 10;
+
+  RXID  = 0;
+  RYID  = 1;
+  RZID  = 2;
+  SXID  = 3;
+  SYID  = 4;
+  SZID  = 5;
+  TXID  = 6;
+  TYID  = 7;
+  TZID  = 8;
+  JID   = 9;
+
+  props["defines/" "p_Nvgeo"]= Nvgeo;
+  props["defines/" "p_RXID"]= RXID;
+  props["defines/" "p_SXID"]= SXID;
+  props["defines/" "p_TXID"]= TXID;
 
+  props["defines/" "p_RYID"]= RYID;
+  props["defines/" "p_SYID"]= SYID;
+  props["defines/" "p_TYID"]= TYID;
+
+  props["defines/" "p_RZID"]= RZID;
+  props["defines/" "p_SZID"]= SZID;
+  props["defines/" "p_TZID"]= TZID;
+
+  props["defines/" "p_JID"]= JID;
+
+  /* unified storage array for geometric factors */
   /* note that we have volume geometric factors for each node */
-  vgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*Nvgeo*Np, sizeof(dfloat));
+  vgeo.malloc((Nelements+totalHaloPairs)*Nvgeo*Np);
+
+  Nggeo = 6;
+
+  G00ID=0;
+  G01ID=1;
+  G02ID=2;
+  G11ID=3;
+  G12ID=4;
+  G22ID=5;
+
+  props["defines/" "p_Nggeo"]= Nggeo;
+  props["defines/" "p_G00ID"]= G00ID;
+  props["defines/" "p_G01ID"]= G01ID;
+  props["defines/" "p_G02ID"]= G02ID;
+  props["defines/" "p_G11ID"]= G11ID;
+  props["defines/" "p_G12ID"]= G12ID;
+  props["defines/" "p_G22ID"]= G22ID;
 
   /* number of second order geometric factors */
-  Nggeo = 7;
-  ggeo = (dfloat*) calloc(Nelements*Nggeo, sizeof(dfloat));
+  ggeo.malloc(Nelements*Nggeo*Np);
+
+  wJ.malloc(Nelements*Np);
 
+  #pragma omp parallel for
   for(int e=0;e<Nelements;++e){ /* for each element */
 
     for(int n=0;n<Np;++n){
@@ -83,11 +130,7 @@ void meshTri3D::GeometricFactors(){
 
       dfloat J = xn*tx + yn*ty + zn*tz;
 
-      if(J<1e-8) {
-        stringstream ss;
-        ss << "Negative J found at element " << e << "\n";
-        LIBP_ABORT(ss.str())
-      }
+      LIBP_ABORT("Negative J found at element " << e, J<1e-8);
 
       rx /= J;
       ry /= J;
@@ -104,11 +147,7 @@ void meshTri3D::GeometricFactors(){
       // use this for "volume" Jacobian
       J = sqrt(Gx*Gx+Gy*Gy+Gz*Gz);
 
-      if(J<1e-8) {
-        stringstream ss;
-        ss << "Negative J found at element " << e << "\n";
-        LIBP_ABORT(ss.str())
-      }
+      LIBP_ABORT("Negative J found at element " << e, J<1e-8);
 
       /* store geometric factors */
       int base = Nvgeo*Np*e + n;
@@ -124,13 +163,19 @@ void meshTri3D::GeometricFactors(){
       vgeo[base + Np*TZID] = tz;
       vgeo[base + Np*JID]  = J;
 
-      ggeo[Nggeo*e + G00ID] = J*(rx*rx + ry*ry + rz*rz);
-      ggeo[Nggeo*e + G01ID] = J*(rx*sx + ry*sy + rz*sz);
-      ggeo[Nggeo*e + G11ID] = J*(sx*sx + sy*sy + sz*sz);
-      ggeo[Nggeo*e + GWJID]  = J;
+      ggeo[Nggeo*e*Np+n + Np*G00ID] = J*(rx*rx + ry*ry + rz*rz);
+      ggeo[Nggeo*e*Np+n + Np*G01ID] = J*(rx*sx + ry*sy + rz*sz);
+      ggeo[Nggeo*e*Np+n + Np*G11ID] = J*(sx*sx + sy*sy + sz*sz);
 
+      wJ[e*Np + n]  = J;
     }
   }
 
-  halo->Exchange(vgeo, Nvgeo*Np, ogs_dfloat);
+  halo.Exchange(vgeo, Nvgeo*Np);
+
+  o_wJ   = platform.malloc<dfloat>(wJ);
+  o_vgeo = platform.malloc<dfloat>(vgeo);
+  o_ggeo = platform.malloc<dfloat>(ggeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshGeometricPartition2D.cpp b/libs/mesh/meshGeometricPartition2D.cpp
deleted file mode 100644
index 80770a7f3..000000000
--- a/libs/mesh/meshGeometricPartition2D.cpp
+++ /dev/null
@@ -1,396 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-
-#define bitRange 15
-
-#if 0
-
-/// THIS SECTION ------------------------------------------------------------------------------------>
-// taken from: http://and-what-happened.blogspot.com/2011/08/fast-2d-and-3d-hilbert-curves-and.html
-
-unsigned int Morton_2D_Encode_16bit( unsigned int index1, unsigned int index2 )
-{ // pack 2 16-bit indices into a 32-bit Morton code
-  index1 &= 0x0000ffff;
-  index2 &= 0x0000ffff;
-  index1 |= ( index1 << 8 );
-  index2 |= ( index2 << 8 );
-  index1 &= 0x00ff00ff;
-  index2 &= 0x00ff00ff;
-  index1 |= ( index1 << 4 );
-  index2 |= ( index2 << 4 );
-  index1 &= 0x0f0f0f0f;
-  index2 &= 0x0f0f0f0f;
-  index1 |= ( index1 << 2 );
-  index2 |= ( index2 << 2 );
-  index1 &= 0x33333333;
-  index2 &= 0x33333333;
-  index1 |= ( index1 << 1 );
-  index2 |= ( index2 << 1 );
-  index1 &= 0x55555555;
-  index2 &= 0x55555555;
-  return( index1 | ( index2 << 1 ) );
-}
-
-unsigned int MortonToHilbert2D( const unsigned int morton, const unsigned int bits )
-{
-  unsigned int hilbert = 0;
-  unsigned int remap = 0xb4;
-  unsigned int block = ( bits << 1 );
-  while( block )
-    {
-      block -= 2;
-      unsigned int mcode = ( ( morton >> block ) & 3 );
-      unsigned int hcode = ( ( remap >> ( mcode << 1 ) ) & 3 );
-      remap ^= ( 0x82000028 >> ( hcode << 3 ) );
-      hilbert = ( ( hilbert << 2 ) + hcode );
-    }
-  return( hilbert );
-}
-
-
-unsigned int hilbert2D(unsigned int index1, unsigned int index2){
-
-  unsigned int morton = Morton_2D_Encode_16bit(index1,index2);
-
-  return MortonToHilbert2D(morton, 16);
-}
-
-/// THIS SECTION TO HERE <--------------------------------------------------------------------------------
-
-// spread bits of i by introducing zeros between binary bits
-unsigned long long int bitSplitter(unsigned int i){
-
-  unsigned long long int mask = 1;
-  unsigned long long int li = i;
-  unsigned long long int lj = 0;
-
-  for(int b=0;b<bitRange;++b){
-    lj |=  (li & mask) << b;
-    mask <<= 1;
-  }
-
-  return lj;
-
-}
-
-// compute Morton index of (ix,iy) relative to a bitRange x bitRange  Morton lattice
-unsigned long long int mortonIndex2D(unsigned int ix, unsigned int iy){
-
-  // spread bits of ix apart (introduce zeros)
-  unsigned long long int sx = bitSplitter(ix);
-  unsigned long long int sy = bitSplitter(iy);
-
-  // interleave bits of ix and iy
-  unsigned long long int mi = sx | (sy<<1);
-
-  return mi;
-}
-
-#else
-
-// from: https://en.wikipedia.org/wiki/Hilbert_curve
-
-//rotate/flip a quadrant appropriately
-static void rot(unsigned int n, unsigned int *x, unsigned int *y, unsigned int rx, unsigned int ry) {
-  if (ry == 0) {
-    if (rx == 1) {
-      *x = n-1 - *x;
-      *y = n-1 - *y;
-    }
-
-    //Swap x and y
-    int t  = *x;
-    *x = *y;
-    *y = t;
-  }
-}
-
-
-
-//convert (x,y) to d
-static unsigned int hilbert2D (unsigned int n, unsigned int x, unsigned int y) {
-  unsigned int rx, ry, s, d=0;
-  for (s=n/2; s>0; s/=2) {
-    rx = (x & s) > 0;
-    ry = (y & s) > 0;
-    d += s * s * ((3 * rx) ^ ry);
-    rot(s, &x, &y, rx, ry);
-  }
-  return d;
-}
-
-#endif
-
-// capsule for element vertices + Morton index
-typedef struct {
-
-  unsigned long long int index;
-
-  dlong element;
-
-  int type;
-
-  // 4 for maximum number of vertices per element in 2D
-  hlong v[4];
-
-  dfloat EX[4], EY[4];
-
-}element_t;
-
-// compare the Morton indices for two element capsules
-static int compareElements2D(const void *a, const void *b){
-
-  element_t *ea = (element_t*) a;
-  element_t *eb = (element_t*) b;
-
-  if(ea->index < eb->index) return -1;
-  if(ea->index > eb->index) return  1;
-
-  return 0;
-
-}
-
-// stub for the match function needed by parallelSort
-static void bogusMatch(void *a, void *b){ }
-
-// geometric partition of elements in 2D mesh using Morton ordering + parallelSort
-void mesh2D::GeometricPartition(){
-
-  dlong maxNelements;
-  MPI_Allreduce(&(Nelements), &maxNelements, 1, MPI_DLONG, MPI_MAX, comm);
-  maxNelements = 2*((maxNelements+1)/2);
-
-  // fix maxNelements
-  element_t *elements
-    = (element_t*) calloc(maxNelements, sizeof(element_t));
-
-  // local bounding box of element centers
-  dfloat mincx = 1e9, maxcx = -1e9;
-  dfloat mincy = 1e9, maxcy = -1e9;
-
-  // compute element centers on this process
-  for(dlong e=0;e<Nelements;++e){
-    dfloat cx = 0, cy = 0;
-    for(int n=0;n<Nverts;++n){
-      cx += EX[e*Nverts+n];
-      cy += EY[e*Nverts+n];
-    }
-    cx /= Nverts;
-    cy /= Nverts;
-
-    mincx = mymin(mincx, cx);
-    maxcx = mymax(maxcx, cx);
-    mincy = mymin(mincy, cy);
-    maxcy = mymax(maxcy, cy);
-  }
-
-  dfloat delta = 1e-1;
-  mincx -= delta;
-  mincy -= delta;
-  maxcx += delta;
-  maxcy += delta;
-
-  // find global bounding box of element centers
-  dfloat gmincx, gmincy, gmaxcx, gmaxcy;
-  MPI_Allreduce(&mincx, &gmincx, 1, MPI_DFLOAT, MPI_MIN, comm);
-  MPI_Allreduce(&mincy, &gmincy, 1, MPI_DFLOAT, MPI_MIN, comm);
-  MPI_Allreduce(&maxcx, &gmaxcx, 1, MPI_DFLOAT, MPI_MAX, comm);
-  MPI_Allreduce(&maxcy, &gmaxcy, 1, MPI_DFLOAT, MPI_MAX, comm);
-
-  dfloat maxlength = mymax(gmaxcx-gmincx, gmaxcy-gmincy);
-
-  // choose sub-range of Morton lattice coordinates to embed element centers in
-  unsigned int Nboxes = (((unsigned int)1)<<(bitRange));
-
-  // compute Morton index for each element
-  for(dlong e=0;e<Nelements;++e){
-
-    // element center coordinates
-    dfloat cx = 0, cy = 0;
-    for(int n=0;n<Nverts;++n){
-      cx += EX[e*Nverts+n];
-      cy += EY[e*Nverts+n];
-    }
-    cx /= Nverts;
-    cy /= Nverts;
-
-    // encapsulate element, vertices, Morton index, vertex coordinates
-    elements[e].element = e;
-    for(int n=0;n<Nverts;++n){
-      elements[e].v[n] = EToV[e*Nverts+n];
-      elements[e].EX[n] = EX[e*Nverts+n];
-      elements[e].EY[n] = EY[e*Nverts+n];
-    }
-
-    elements[e].type = elementInfo[e];
-
-    unsigned int ix = (cx-gmincx)*Nboxes/maxlength;
-    unsigned int iy = (cy-gmincy)*Nboxes/maxlength;
-
-    //elements[e].index = mortonIndex2D(ix, iy);
-    elements[e].index = hilbert2D(Nboxes, ix, iy);
-  }
-
-  // pad element array with dummy elements
-  for(dlong e=Nelements;e<maxNelements;++e){
-    elements[e].element = -1;
-
-    elements[e].index = hilbert2D(Nboxes, Nboxes-1, Nboxes-1);
-
-    //    elements[e].index = hilbert2D(Nboxes+1, Nboxes+1);
-    //    elements[e].index = mortonIndex2D(Nboxes+1, Nboxes+1);
-  }
-
-  // odd-even parallel sort of element capsules based on their Morton index
-  parallelSort(size, rank, comm,
-	       maxNelements, elements, sizeof(element_t),
-	       compareElements2D,
-	       bogusMatch);
-
-
-  // compress and renumber elements
-  dlong sk  = 0;
-  for(dlong e=0;e<maxNelements;++e){
-    if(elements[e].element != -1){
-      elements[sk] = elements[e];
-      ++sk;
-    }
-  }
-
-  dlong localNelements = sk;
-
-  /// redistribute elements to improve balancing
-  // TODO: We need a safer version of this for very large meshes.
-  // if dlong is a long long int Nsend and/or sendOffsets may overflow int
-  dlong *globalNelements = (dlong *) calloc(size,sizeof(dlong));
-  hlong *starts = (hlong *) calloc(size+1,sizeof(hlong));
-
-  MPI_Allgather(&localNelements, 1, MPI_DLONG, globalNelements, 1,  MPI_DLONG, comm);
-
-  for(int rr=0;rr<size;++rr)
-    starts[rr+1] = starts[rr]+globalNelements[rr];
-
-  hlong allNelements = starts[size];
-
-  // decide how many to keep on each process
-  hlong chunk = allNelements/size;
-  int remainder = (int) (allNelements - chunk*size);
-
-  int *Nsend = (int *) calloc(size, sizeof(int));
-  int *Nrecv = (int *) calloc(size, sizeof(int));
-  // int *Ncount = (int *) calloc(size, sizeof(int));
-  int *sendOffsets = (int*) calloc(size, sizeof(int));
-  int *recvOffsets = (int*) calloc(size, sizeof(int));
-
-  // Make the MPI_ELEMENT_T data type
-  MPI_Datatype MPI_ELEMENT_T;
-  MPI_Datatype dtype[6] = {MPI_LONG_LONG_INT, MPI_DLONG, MPI_INT,
-                            MPI_HLONG, MPI_DFLOAT, MPI_DFLOAT};
-  int blength[6] = {1, 1, 1, 4, 4, 4};
-  MPI_Aint addr[6], displ[6];
-  MPI_Get_address ( &(elements[0]        ), addr+0);
-  MPI_Get_address ( &(elements[0].element), addr+1);
-  MPI_Get_address ( &(elements[0].type   ), addr+2);
-  MPI_Get_address ( &(elements[0].v[0]   ), addr+3);
-  MPI_Get_address ( &(elements[0].EX[0]  ), addr+4);
-  MPI_Get_address ( &(elements[0].EY[0]  ), addr+5);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  displ[4] = addr[4] - addr[0];
-  displ[5] = addr[5] - addr[0];
-  MPI_Type_create_struct (6, blength, displ, dtype, &MPI_ELEMENT_T);
-  MPI_Type_commit (&MPI_ELEMENT_T);
-
-  for(dlong e=0;e<localNelements;++e){
-
-    // global element index
-    elements[e].element = starts[rank]+e;
-
-    // 0, chunk+1, 2*(chunk+1) ..., remainder*(chunk+1), remainder*(chunk+1) + chunk
-    int rr;
-    if(elements[e].element<remainder*(chunk+1))
-      rr = elements[e].element/(chunk+1);
-    else
-      rr = remainder + ((elements[e].element-remainder*(chunk+1))/chunk);
-
-    ++Nsend[rr];
-  }
-
-  // find send offsets
-  for(int rr=1;rr<size;++rr)
-    sendOffsets[rr] = sendOffsets[rr-1] + Nsend[rr-1];
-
-  // exchange byte counts
-  MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, comm);
-
-  // count incoming clusters
-  dlong newNelements = 0;
-  for(int rr=0;rr<size;++rr)
-    newNelements += Nrecv[rr];
-
-  for(int rr=1;rr<size;++rr)
-    recvOffsets[rr] = recvOffsets[rr-1] + Nrecv[rr-1];
-
-  element_t *tmpElements = (element_t *) calloc(newNelements, sizeof(element_t));
-
-  // exchange parallel clusters
-  MPI_Alltoallv(elements, Nsend, sendOffsets, MPI_ELEMENT_T,
-                tmpElements, Nrecv, recvOffsets, MPI_ELEMENT_T, comm);
-
-  MPI_Barrier(comm);
-  MPI_Type_free(&MPI_ELEMENT_T);
-
-  // replace elements with inbound elements
-  if (elements) free(elements);
-  elements = tmpElements;
-
-  // reset number of elements and element-to-vertex connectivity from returned capsules
-  free(EToV);
-  free(EX);
-  free(EY);
-  free(elementInfo);
-
-  Nelements = newNelements;
-  EToV = (hlong*) calloc(newNelements*Nverts, sizeof(hlong));
-  EX = (dfloat*) calloc(newNelements*Nverts, sizeof(dfloat));
-  EY = (dfloat*) calloc(newNelements*Nverts, sizeof(dfloat));
-  elementInfo = (hlong*) calloc(newNelements, sizeof(hlong));
-
-  for(dlong e=0;e<newNelements;++e){
-    for(int n=0;n<Nverts;++n){
-      EToV[e*Nverts + n] = elements[e].v[n];
-      EX[e*Nverts + n]   = elements[e].EX[n];
-      EY[e*Nverts + n]   = elements[e].EY[n];
-    }
-    elementInfo[e] = elements[e].type;
-  }
-  if (elements) free(elements);
-}
diff --git a/libs/mesh/meshGeometricPartition3D.cpp b/libs/mesh/meshGeometricPartition3D.cpp
deleted file mode 100644
index 753cb82d1..000000000
--- a/libs/mesh/meshGeometricPartition3D.cpp
+++ /dev/null
@@ -1,354 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
-
-// 20 bits per coordinate
-#define bitRange 20
-
-// spread bits of i by introducing zeros between binary bits
-static unsigned long long int bitSplitter3D(unsigned int i){
-
-  unsigned long long int mask = 1;
-  unsigned long long int li = i;
-  unsigned long long int lj = 0;
-
-  for(int b=0;b<bitRange;++b){
-    lj |= ((li & mask) << 2*b); // bit b moves to bit 3b
-    mask <<= 1;
-  }
-
-  return lj;
-}
-
-// compute Morton index of (ix,iy) relative to a bitRange x bitRange  Morton lattice
-static unsigned long long int mortonIndex3D(unsigned int ix, unsigned int iy, unsigned int iz,
-                                            const int shiftx, const int shifty, const int shiftz){
-
-  // spread bits of ix apart (introduce zeros)
-  unsigned long long int sx = bitSplitter3D(ix);
-  unsigned long long int sy = bitSplitter3D(iy);
-  unsigned long long int sz = bitSplitter3D(iz);
-
-  // interleave bits of ix and iy
-  unsigned long long int mi = (sx<<shiftx) | (sy<<shifty) | (sz<<shiftz);
-
-  return mi;
-}
-
-// capsule for element vertices + Morton index
-typedef struct {
-
-  unsigned long long int index;
-
-  dlong element;
-
-  int type;
-
-  // use 8 for maximum vertices per element
-  hlong v[8];
-
-  dfloat EX[8], EY[8], EZ[8];
-
-}element_t;
-
-// compare the Morton indices for two element capsules
-static int compareElements(const void *a, const void *b){
-
-  element_t *ea = (element_t*) a;
-  element_t *eb = (element_t*) b;
-
-  if(ea->index < eb->index) return -1;
-  if(ea->index > eb->index) return  1;
-
-  return 0;
-
-}
-
-// stub for the match function needed by parallelSort
-static void bogusMatch3D(void *a, void *b){ }
-
-// geometric partition of elements in 3D mesh using Morton ordering + parallelSort
-void mesh3D::GeometricPartition(){
-
-  dlong maxNelements;
-  MPI_Allreduce(&(Nelements), &maxNelements, 1, MPI_DLONG, MPI_MAX,
-		comm);
-  maxNelements = 2*((maxNelements+1)/2);
-
-  // fix maxNelements
-  element_t *elements
-    = (element_t*) calloc(maxNelements, sizeof(element_t));
-
-  // local bounding box of element centers
-  dfloat minvx = 1e9, maxvx = -1e9;
-  dfloat minvy = 1e9, maxvy = -1e9;
-  dfloat minvz = 1e9, maxvz = -1e9;
-
-  // compute element centers on this process
-  for(dlong n=0;n<Nverts*Nelements;++n){
-    minvx = mymin(minvx, EX[n]);
-    maxvx = mymax(maxvx, EX[n]);
-    minvy = mymin(minvy, EY[n]);
-    maxvy = mymax(maxvy, EY[n]);
-    minvz = mymin(minvz, EZ[n]);
-    maxvz = mymax(maxvz, EZ[n]);
-  }
-
-  // find global bounding box of element centers
-  dfloat gminvx, gminvy, gminvz, gmaxvx, gmaxvy, gmaxvz;
-  MPI_Allreduce(&minvx, &gminvx, 1, MPI_DFLOAT, MPI_MIN, comm);
-  MPI_Allreduce(&minvy, &gminvy, 1, MPI_DFLOAT, MPI_MIN, comm);
-  MPI_Allreduce(&minvz, &gminvz, 1, MPI_DFLOAT, MPI_MIN, comm);
-  MPI_Allreduce(&maxvx, &gmaxvx, 1, MPI_DFLOAT, MPI_MAX, comm);
-  MPI_Allreduce(&maxvy, &gmaxvy, 1, MPI_DFLOAT, MPI_MAX, comm);
-  MPI_Allreduce(&maxvz, &gmaxvz, 1, MPI_DFLOAT, MPI_MAX, comm);
-
-  // choose sub-range of Morton lattice coordinates to embed element centers in
-  unsigned long long int Nboxes = (((unsigned long long int)1)<<(bitRange-1));
-
-  // Set the fastest moving indicies in the Morton ordering based on the smallest physical dimensions
-  int shiftx=0, shifty=0, shiftz=0;
-  if ((gmaxvx-gminvx)<(gmaxvy-gminvy) && (gmaxvy-gminvy)<(gmaxvz-gminvz)) {
-    shiftx=0; shifty=1; shiftz=2;
-  } else if ((gmaxvx-gminvx)<(gmaxvz-gminvz) && (gmaxvz-gminvz)<(gmaxvy-gminvy)) {
-    shiftx=0; shifty=2; shiftz=1;
-  } else if ((gmaxvy-gminvy)<(gmaxvx-gminvx) && (gmaxvx-gminvx)<(gmaxvz-gminvz)) {
-    shiftx=1; shifty=0; shiftz=2;
-  } else if ((gmaxvy-gminvy)<(gmaxvz-gminvz) && (gmaxvz-gminvz)<(gmaxvx-gminvx)) {
-    shiftx=2; shifty=0; shiftz=1;
-  } else if ((gmaxvz-gminvz)<(gmaxvx-gminvx) && (gmaxvx-gminvx)<(gmaxvy-gminvy)) {
-    shiftx=1; shifty=2; shiftz=0;
-  } else {
-    shiftx=2; shifty=1; shiftz=0;
-  }
-
-  dfloat maxlength = mymax(gmaxvx-gminvx, mymax(gmaxvy-gminvy, gmaxvz-gminvz));
-
-  // compute Morton index for each element
-  for(dlong e=0;e<Nelements;++e){
-
-    // element center coordinates
-    dfloat cx = 0, cy = 0, cz = 0;
-    for(int n=0;n<Nverts;++n){
-      cx += EX[e*Nverts+n];
-      cy += EY[e*Nverts+n];
-      cz += EZ[e*Nverts+n];
-    }
-    cx /= Nverts;
-    cy /= Nverts;
-    cz /= Nverts;
-
-    // encapsulate element, vertices, Morton index, vertex coordinates
-    elements[e].element = e;
-    for(int n=0;n<Nverts;++n){
-      elements[e].v[n] = EToV[e*Nverts+n];
-      elements[e].EX[n] = EX[e*Nverts+n];
-      elements[e].EY[n] = EY[e*Nverts+n];
-      elements[e].EZ[n] = EZ[e*Nverts+n];
-    }
-
-    elements[e].type = elementInfo[e];
-
-
-    // avoid stretching axes
-    unsigned long long int ix = (cx-gminvx)*Nboxes/maxlength;
-    unsigned long long int iy = (cy-gminvy)*Nboxes/maxlength;
-    unsigned long long int iz = (cz-gminvz)*Nboxes/maxlength;
-
-    elements[e].index = mortonIndex3D(ix, iy, iz, shiftx, shifty, shiftz);
-  }
-
-  // pad element array with dummy elements
-  for(dlong e=Nelements;e<maxNelements;++e){
-    elements[e].element = -1;
-    elements[e].index = mortonIndex3D(Nboxes+1, Nboxes+1, Nboxes+1, shiftx, shifty, shiftz);
-  }
-
-  // odd-even parallel sort of element capsules based on their Morton index
-  parallelSort(size, rank, comm,
-	       maxNelements, elements, sizeof(element_t),
-	       compareElements,
-	       bogusMatch3D);
-
-#if 0
-  // count number of elements that end up on this process
-  int cnt = 0;
-  for(int e=0;e<maxNelements;++e)
-    cnt += (elements[e].element != -1);
-
-  // reset number of elements and element-to-vertex connectivity from returned capsules
-  free(EToV);
-  free(EX);
-  free(EY);
-  free(EZ);
-
-  Nelements = cnt;
-  EToV = (int*) calloc(cnt*Nverts, sizeof(int));
-  EX = (dfloat*) calloc(cnt*Nverts, sizeof(dfloat));
-  EY = (dfloat*) calloc(cnt*Nverts, sizeof(dfloat));
-  EZ = (dfloat*) calloc(cnt*Nverts, sizeof(dfloat));
-
-  cnt = 0;
-  for(int e=0;e<maxNelements;++e){
-    if(elements[e].element != -1){
-      for(int n=0;n<Nverts;++n){
-	EToV[cnt*Nverts + n] = elements[e].v[n];
-	EX[cnt*Nverts + n]   = elements[e].EX[n];
-	EY[cnt*Nverts + n]   = elements[e].EY[n];
-	EZ[cnt*Nverts + n]   = elements[e].EZ[n];
-      }
-      ++cnt;
-    }
-  }
-#else
-  // compress and renumber elements
-  dlong sk  = 0;
-  for(dlong e=0;e<maxNelements;++e){
-    if(elements[e].element != -1){
-      elements[sk] = elements[e];
-      ++sk;
-    }
-  }
-
-  dlong localNelements = sk;
-
-  /// redistribute elements to improve balancing
-  dlong *globalNelements = (dlong *) calloc(size,sizeof(dlong));
-  hlong *starts = (hlong *) calloc(size+1,sizeof(hlong));
-
-  MPI_Allgather(&localNelements, 1, MPI_DLONG, globalNelements, 1,  MPI_DLONG, comm);
-
-  for(int rr=0;rr<size;++rr)
-    starts[rr+1] = starts[rr]+globalNelements[rr];
-
-  hlong allNelements = starts[size];
-
-  // decide how many to keep on each process
-  hlong chunk = allNelements/size;
-  int remainder = (int) (allNelements - chunk*size);
-
-  int *Nsend = (int *) calloc(size, sizeof(int));
-  int *Nrecv = (int *) calloc(size, sizeof(int));
-  // int *Ncount = (int *) calloc(size, sizeof(int));
-  int *sendOffsets = (int*) calloc(size, sizeof(int));
-  int *recvOffsets = (int*) calloc(size, sizeof(int));
-
-
-  // Make the MPI_ELEMENT_T data type
-  MPI_Datatype MPI_ELEMENT_T;
-  MPI_Datatype dtype[7] = {MPI_LONG_LONG_INT, MPI_DLONG, MPI_INT,
-                            MPI_HLONG, MPI_DFLOAT, MPI_DFLOAT, MPI_DFLOAT};
-  int blength[7] = {1, 1, 1, 8, 8, 8, 8};
-  MPI_Aint addr[7], displ[7];
-  MPI_Get_address ( &(elements[0]        ), addr+0);
-  MPI_Get_address ( &(elements[0].element), addr+1);
-  MPI_Get_address ( &(elements[0].type   ), addr+2);
-  MPI_Get_address ( &(elements[0].v[0]   ), addr+3);
-  MPI_Get_address ( &(elements[0].EX[0]  ), addr+4);
-  MPI_Get_address ( &(elements[0].EY[0]  ), addr+5);
-  MPI_Get_address ( &(elements[0].EZ[0]  ), addr+6);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  displ[4] = addr[4] - addr[0];
-  displ[5] = addr[5] - addr[0];
-  displ[6] = addr[6] - addr[0];
-  MPI_Type_create_struct (7, blength, displ, dtype, &MPI_ELEMENT_T);
-  MPI_Type_commit (&MPI_ELEMENT_T);
-
-
-  for(dlong e=0;e<localNelements;++e){
-
-    // global element index
-    elements[e].element = starts[rank]+e;
-
-    // 0, chunk+1, 2*(chunk+1) ..., remainder*(chunk+1), remainder*(chunk+1) + chunk
-    int rr;
-    if(elements[e].element<remainder*(chunk+1))
-      rr = elements[e].element/(chunk+1);
-    else
-      rr = remainder + ((elements[e].element-remainder*(chunk+1))/chunk);
-
-    ++Nsend[rr];
-  }
-
-  // find send offsets
-  for(int rr=1;rr<size;++rr)
-    sendOffsets[rr] = sendOffsets[rr-1] + Nsend[rr-1];
-
-  // exchange byte counts
-  MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, comm);
-
-  // count incoming clusters
-  dlong newNelements = 0;
-  for(int rr=0;rr<size;++rr)
-    newNelements += Nrecv[rr];
-
-  for(int rr=1;rr<size;++rr)
-    recvOffsets[rr] = recvOffsets[rr-1] + Nrecv[rr-1];
-
-  element_t *tmpElements = (element_t *) calloc(newNelements, sizeof(element_t));
-
-  // exchange parallel clusters
-  MPI_Alltoallv(elements, Nsend, sendOffsets, MPI_ELEMENT_T,
-                tmpElements, Nrecv, recvOffsets, MPI_ELEMENT_T, comm);
-
-  MPI_Barrier(comm);
-  MPI_Type_free(&MPI_ELEMENT_T);
-
-  // replace elements with inbound elements
-  if (elements) free(elements);
-  elements = tmpElements;
-
-  // reset number of elements and element-to-vertex connectivity from returned capsules
-  free(EToV);
-  free(EX);
-  free(EY);
-  free(EZ);
-  free(elementInfo);
-
-  Nelements = newNelements;
-  EToV = (hlong*) calloc(newNelements*Nverts, sizeof(hlong));
-  EX = (dfloat*) calloc(newNelements*Nverts, sizeof(dfloat));
-  EY = (dfloat*) calloc(newNelements*Nverts, sizeof(dfloat));
-  EZ = (dfloat*) calloc(newNelements*Nverts, sizeof(dfloat));
-  elementInfo = (hlong*) calloc(newNelements, sizeof(hlong));
-
-  for(dlong e=0;e<newNelements;++e){
-    for(int n=0;n<Nverts;++n){
-      EToV[e*Nverts + n] = elements[e].v[n];
-      EX[e*Nverts + n]   = elements[e].EX[n];
-      EY[e*Nverts + n]   = elements[e].EY[n];
-      EZ[e*Nverts + n]   = elements[e].EZ[n];
-    }
-    elementInfo[e] = elements[e].type;
-  }
-  if (elements) free(elements);
-#endif
-}
diff --git a/libs/mesh/meshHaloRingSetup.cpp b/libs/mesh/meshHaloRingSetup.cpp
index 476b1aa14..5af4b6dd2 100644
--- a/libs/mesh/meshHaloRingSetup.cpp
+++ b/libs/mesh/meshHaloRingSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,15 @@ SOFTWARE.
 
 #include "mesh.hpp"
 
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::sort;
+#else
+using std::sort;
+#endif
+
+namespace libp {
+
 typedef struct{
 
   hlong gid;
@@ -39,75 +48,96 @@ typedef struct{
 // exchange of trace nodes
 void mesh_t::HaloRingSetup(){
 
-  //make a global indexing of element Ids
-  hlong *globalOffsets = (hlong *) calloc(size+1,sizeof(hlong));
-  hlong localNelements = (hlong) Nelements;
+  memory<hlong> globalOffset(size+1, 0);
 
   //gather number of elements on each rank
-  MPI_Allgather(&localNelements, 1, MPI_HLONG, globalOffsets+1, 1, MPI_HLONG, comm);
+  hlong localNelements = Nelements;
+  comm.Allgather(localNelements, globalOffset+1);
 
   for(int rr=0;rr<size;++rr)
-    globalOffsets[rr+1] = globalOffsets[rr]+globalOffsets[rr+1];
+    globalOffset[rr+1] = globalOffset[rr]+globalOffset[rr+1];
+
+
+  dlong Ntotal = Nverts*(Nelements+totalHaloPairs);
+
+  memory<int> minRank(Ntotal);
+  memory<int> maxRank(Ntotal);
 
-  //use the gs to find what nodes are local to this rank
-  dlong Ntotal = Np*Nelements;
-  int *minRank = (int *) calloc(Ntotal,sizeof(int));
-  int *maxRank = (int *) calloc(Ntotal,sizeof(int));
   for (dlong i=0;i<Ntotal;i++) {
     minRank[i] = rank;
     maxRank[i] = rank;
   }
 
-  ogs->GatherScatter(minRank, ogs_int, ogs_min, ogs_sym); //minRank[n] contains the smallest rank taking part in the gather of node n
-  ogs->GatherScatter(maxRank, ogs_int, ogs_max, ogs_sym); //maxRank[n] contains the largest rank taking part in the gather of node n
+  hlong gatherChange = 1;
+
+  // keep comparing numbers on positive and negative traces until convergence
+  while(gatherChange>0){
+
+    // reset change counter
+    gatherChange = 0;
+
+    // send halo data and recv into extension of buffer
+    halo.Exchange(minRank, Nverts);
+    halo.Exchange(maxRank, Nverts);
+
+    // compare trace vertices
+    #pragma omp parallel for collapse(2)
+    for(dlong e=0;e<Nelements;++e){
+      for(int n=0;n<Nfaces*NfaceVertices;++n){
+        dlong id  = e*Nfaces*NfaceVertices + n;
+        dlong idM = VmapM[id];
+        dlong idP = VmapP[id];
+
+        int minRankM = minRank[idM];
+        int minRankP = minRank[idP];
 
-  //We already made a list of the globally connected element in ParallelGatherScatterSetup
-  // NglobalGatherElements and globalGatherElementList contain the count and list
+        int maxRankM = maxRank[idM];
+        int maxRankP = maxRank[idP];
+
+        if(minRankP<minRankM){
+          gatherChange=1;
+          minRank[idM] = minRankP;
+        }
+
+        if(maxRankP>maxRankM){
+          gatherChange=1;
+          maxRank[idM] = maxRankP;
+        }
+      }
+    }
+
+    // sum up changes
+    comm.Allreduce(gatherChange);
+  }
 
   //Make a list of the elements participating in the ring exchange
   //Count the number of shared vertices in the local mesh
   dlong NsendVerts=0;
-  for (int e=0;e<NglobalGatherElements;e++) { //for all global elements
+  for (dlong e=0;e<Nelements;e++) { //for all local elements
     for (int v=0;v<Nverts;v++) {
-      dlong n = vertexNodes[v] + globalGatherElementList[e]*Np; //Id of a vertex in a global element
-      if ((minRank[n]!=rank)||(maxRank[n]!=rank)) { //vertex is shared
+      dlong id = e*Nverts + v; //flat index of vertex v of local element e
+      if ((minRank[id]!=rank)||(maxRank[id]!=rank)) { //vertex is shared
         NsendVerts++;
       }
     }
   }
 
-  vertex_t *vertexSendList = (vertex_t*) malloc(NsendVerts*sizeof(vertex_t));
-
-  int *vertexSendCounts = (int*) calloc(size, sizeof(int));
-  int *vertexRecvCounts = (int*) calloc(size, sizeof(int));
-  int *vertexSendOffsets = (int*) calloc(size+1, sizeof(int));
-  int *vertexRecvOffsets = (int*) calloc(size+1, sizeof(int));
-
-  // Make the MPI_VERTEX_T data type
-  MPI_Datatype MPI_VERTEX_T;
-  MPI_Datatype dtype[4] = {MPI_HLONG, MPI_DLONG, MPI_DLONG, MPI_DLONG};
-  int blength[4] = {1, 1, 1, 1};
-  MPI_Aint addr[4], displ[4];
-  MPI_Get_address ( &(vertexSendList[0]        ), addr+0);
-  MPI_Get_address ( &(vertexSendList[0].element), addr+1);
-  MPI_Get_address ( &(vertexSendList[0].rank   ), addr+2);
-  MPI_Get_address ( &(vertexSendList[0].dest   ), addr+3);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  MPI_Type_create_struct (4, blength, displ, dtype, &MPI_VERTEX_T);
-  MPI_Type_commit (&MPI_VERTEX_T);
+  memory<vertex_t> vertexSendList(NsendVerts);
+
+  memory<int> vertexSendCounts(size, 0);
+  memory<int> vertexRecvCounts(size);
+  memory<int> vertexSendOffsets(size+1);
+  memory<int> vertexRecvOffsets(size+1);
 
   NsendVerts=0;
-  for (int e=0;e<NglobalGatherElements;e++) { //for all global elements
+  for (dlong e=0;e<Nelements;e++) { //for all local elements
     for (int v=0;v<Nverts;v++) {
-      dlong n = vertexNodes[v] + globalGatherElementList[e]*Np; //Id of a vertex in a global element
-      if ((minRank[n]!=rank)||(maxRank[n]!=rank)) { //vertex is shared
-        vertexSendList[NsendVerts].gid = globalIds[n]; //global node index
-        vertexSendList[NsendVerts].element = globalGatherElementList[e]; //local element index
+      dlong id = e*Nverts + v; //flat index of vertex v of local element e
+      if ((minRank[id]!=rank)||(maxRank[id]!=rank)) { //vertex is shared
+        vertexSendList[NsendVerts].gid = EToV[id]; //global vertex index
+        vertexSendList[NsendVerts].element = e; //local element index
         vertexSendList[NsendVerts].rank = rank;
-        vertexSendList[NsendVerts].dest = globalIds[n]%size; //destination rank for sorting
+        vertexSendList[NsendVerts].dest = EToV[id]%size; //destination rank for sorting
 
         vertexSendCounts[vertexSendList[NsendVerts].dest]++; //count outgoing
 
@@ -116,19 +146,17 @@ void mesh_t::HaloRingSetup(){
     }
   }
 
-  free(minRank); free(maxRank);
-
   // sort based on destination (=gid%size)
-  std::sort(vertexSendList, vertexSendList+NsendVerts,
+  sort(vertexSendList.ptr(), vertexSendList.ptr()+NsendVerts,
             [](const vertex_t& a, const vertex_t& b)
               {return a.dest < b.dest;});
 
   // share counts
-  MPI_Alltoall(vertexSendCounts, 1, MPI_INT,
-               vertexRecvCounts, 1, MPI_INT,
-               comm);
+  comm.Alltoall(vertexSendCounts, vertexRecvCounts);
 
   dlong NrecvVerts = 0;
+  vertexSendOffsets[0] = 0;
+  vertexRecvOffsets[0] = 0;
   for(int rr=0;rr<size;++rr){
     NrecvVerts += vertexRecvCounts[rr];
 
@@ -136,15 +164,14 @@ void mesh_t::HaloRingSetup(){
     vertexRecvOffsets[rr+1] = vertexRecvOffsets[rr] + vertexRecvCounts[rr];
   }
 
-  vertex_t *vertexRecvList = (vertex_t*) malloc(NrecvVerts*sizeof(vertex_t));
+  memory<vertex_t> vertexRecvList(NrecvVerts);
 
   // exchange shared vertices
-  MPI_Alltoallv(vertexSendList, vertexSendCounts, vertexSendOffsets, MPI_VERTEX_T,
-                vertexRecvList, vertexRecvCounts, vertexRecvOffsets, MPI_VERTEX_T,
-                comm);
+  comm.Alltoallv(vertexSendList, vertexSendCounts, vertexSendOffsets,
+                 vertexRecvList, vertexRecvCounts, vertexRecvOffsets);
 
   // sort based on globalId to find matches
-  std::sort(vertexRecvList, vertexRecvList+NrecvVerts,
+  sort(vertexRecvList.ptr(), vertexRecvList.ptr()+NrecvVerts,
             [](const vertex_t& a, const vertex_t& b)
               {return a.gid < b.gid;});
 
@@ -157,8 +184,9 @@ void mesh_t::HaloRingSetup(){
   }
 
   //Build offsets to unique vertice starts
-  dlong *vertexOffsets = (dlong*) calloc(Nunique+1, sizeof(dlong));
+  memory<dlong> vertexOffsets(Nunique+1);
 
+  vertexOffsets[0] = 0;
   Nunique=(NrecvVerts) ? 1:0;
   for(dlong n=1;n<NrecvVerts;++n){
     if (vertexRecvList[n].gid != vertexRecvList[n-1].gid) { // new vertex
@@ -167,9 +195,6 @@ void mesh_t::HaloRingSetup(){
   }
   vertexOffsets[Nunique] = NrecvVerts;
 
-  //make sure the AlltoAll is done everywhere so we can reuse vertexSend arrays
-  MPI_Barrier(comm);
-
   //reset counts
   NsendVerts = 0;
   for(int rr=0;rr<size;++rr){
@@ -191,7 +216,7 @@ void mesh_t::HaloRingSetup(){
   }
 
   //resize send storage
-  vertexSendList = (vertex_t*) realloc(vertexSendList, NsendVerts*sizeof(vertex_t));
+  vertexSendList.malloc(NsendVerts);
 
   //build list of vertices to send out
   NsendVerts=0;
@@ -209,14 +234,12 @@ void mesh_t::HaloRingSetup(){
   }
 
   // sort based on destination
-  std::sort(vertexSendList, vertexSendList+NsendVerts,
+  sort(vertexSendList.ptr(), vertexSendList.ptr()+NsendVerts,
             [](const vertex_t& a, const vertex_t& b)
               {return a.dest < b.dest;});
 
   // share counts
-  MPI_Alltoall(vertexSendCounts, 1, MPI_INT,
-               vertexRecvCounts, 1, MPI_INT,
-               comm);
+  comm.Alltoall(vertexSendCounts, vertexRecvCounts);
 
   NrecvVerts = 0;
   for(int rr=0;rr<size;++rr){
@@ -227,15 +250,14 @@ void mesh_t::HaloRingSetup(){
   }
 
   //resize recv storage
-  vertexRecvList = (vertex_t*) realloc(vertexRecvList,NrecvVerts*sizeof(vertex_t));
+  vertexRecvList.malloc(NrecvVerts);
 
   // exchange shared vertices
-  MPI_Alltoallv(vertexSendList, vertexSendCounts, vertexSendOffsets, MPI_VERTEX_T,
-                vertexRecvList, vertexRecvCounts, vertexRecvOffsets, MPI_VERTEX_T,
-                comm);
+  comm.Alltoallv(vertexSendList, vertexSendCounts, vertexSendOffsets,
+                 vertexRecvList, vertexRecvCounts, vertexRecvOffsets);
 
   // sort based on rank then element id to find matches
-  std::sort(vertexRecvList, vertexRecvList+NrecvVerts,
+  sort(vertexRecvList.ptr(), vertexRecvList.ptr()+NrecvVerts,
             [](const vertex_t& a, const vertex_t& b) {
               if(a.rank < b.rank) return true;
               if(a.rank > b.rank) return false;
@@ -255,11 +277,11 @@ void mesh_t::HaloRingSetup(){
   }
 
   //make a list of global element ids taking part in the halo exchange
-  hlong *globalElementId = (hlong *) malloc((Nelements+totalRingElements)*sizeof(hlong));
+  memory<hlong> globalElementId(Nelements+totalRingElements);
 
   //outgoing elements
   for(int e=0;e<Nelements;++e)
-    globalElementId[e] = e + globalOffsets[rank] + 1;
+    globalElementId[e] = e + globalOffset[rank] + 1;
 
   //incoming elements
   totalRingElements=0;
@@ -269,7 +291,7 @@ void mesh_t::HaloRingSetup(){
     if (vertexRecvList[0].rank!=rank) {
       globalElementId[Nelements]
                        = -(vertexRecvList[0].element
-                          + globalOffsets[vertexRecvList[0].rank] + 1); //negative so doesnt contribute to sum in ogs
+                          + globalOffset[vertexRecvList[0].rank] + 1); //negative so doesnt contribute to sum in ogs
       totalRingElements++;
     }
   }
@@ -281,7 +303,7 @@ void mesh_t::HaloRingSetup(){
         ||(vertexRecvList[n].element!=vertexRecvList[n-1].element)) {
       globalElementId[Nelements+totalRingElements++]
                        = -(vertexRecvList[n].element
-                          + globalOffsets[vertexRecvList[n].rank] + 1); //negative so doesnt contribute to sum in ogs
+                          + globalOffset[vertexRecvList[n].rank] + 1); //negative so doesnt contribute to sum in ogs
     }
   }
 
@@ -290,19 +312,9 @@ void mesh_t::HaloRingSetup(){
 
   //make the halo exchange op
   int verbose = 0;
-  ringHalo = halo_t::Setup(Nelements+totalRingElements, globalElementId, comm,
-                           verbose, platform);
-
-  //clean up
-  free(globalElementId);
-  free(globalOffsets);
-
-  MPI_Barrier(comm);
-  MPI_Type_free(&MPI_VERTEX_T);
-  free(vertexSendList);
-  free(vertexRecvList);
-  free(vertexSendCounts);
-  free(vertexRecvCounts);
-  free(vertexSendOffsets);
-  free(vertexRecvOffsets);
+  ringHalo.Setup(Nelements+totalRingElements,
+                 globalElementId, comm,
+                 ogs::Auto, verbose, platform);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshHaloSetup.cpp b/libs/mesh/meshHaloSetup.cpp
index 279a80ccb..26b39a65f 100644
--- a/libs/mesh/meshHaloSetup.cpp
+++ b/libs/mesh/meshHaloSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,15 +26,17 @@ SOFTWARE.
 
 #include "mesh.hpp"
 
+namespace libp {
+
 // set up halo infomation for inter-processor MPI
 // exchange of elements or trace nodes
 void mesh_t::HaloSetup(){
 
-  hlong *globalOffset = (hlong *) calloc(size+1,sizeof(hlong));
-  hlong localNelements = (hlong) Nelements;
+  memory<hlong> globalOffset(size+1, 0);
 
   //gather number of elements on each rank
-  MPI_Allgather(&localNelements, 1, MPI_HLONG, globalOffset+1, 1, MPI_HLONG, comm);
+  hlong localNelements = Nelements;
+  comm.Allgather(localNelements, globalOffset+1);
 
   for(int rr=0;rr<size;++rr)
     globalOffset[rr+1] = globalOffset[rr]+globalOffset[rr+1];
@@ -64,8 +66,8 @@ void mesh_t::HaloSetup(){
   NinternalElements = Nelements - NhaloElements;
 
   //record the halo and non-halo element ids
-  internalElementIds = (dlong*) malloc(NinternalElements*sizeof(dlong));
-  haloElementIds     = (dlong*) malloc(NhaloElements*sizeof(dlong));
+  internalElementIds.malloc(NinternalElements);
+  haloElementIds.malloc(NhaloElements);
 
   NhaloElements = 0, NinternalElements = 0;
   for(dlong e=0;e<Nelements;++e){
@@ -82,8 +84,12 @@ void mesh_t::HaloSetup(){
       internalElementIds[NinternalElements++] = e;
   }
 
+  // Send to device
+  o_internalElementIds = platform.malloc<dlong>(internalElementIds);
+  o_haloElementIds = platform.malloc<dlong>(haloElementIds);
+
   //make a list of global element ids taking part in the halo exchange
-  hlong *globalElementId = (hlong *) malloc((Nelements+totalHaloPairs)*sizeof(hlong));
+  memory<hlong> globalElementId(Nelements+totalHaloPairs);
 
   //outgoing elements
   for(int e=0;e<Nelements;++e)
@@ -106,22 +112,12 @@ void mesh_t::HaloSetup(){
   }
 
   //make a halo exchange op
-  int verbose = 0;
-  halo = halo_t::Setup(Nelements+totalHaloPairs, globalElementId, comm,
-                       verbose, platform);
-
-  free(globalElementId);
-  free(globalOffset);
-
-  // grab EX,EY,EZ from halo
-  EX = (dfloat*) realloc(EX, (Nelements+totalHaloPairs)*Nverts*sizeof(dfloat));
-  EY = (dfloat*) realloc(EY, (Nelements+totalHaloPairs)*Nverts*sizeof(dfloat));
-  if (dim==3)
-    EZ = (dfloat*) realloc(EZ, (Nelements+totalHaloPairs)*Nverts*sizeof(dfloat));
-
-  // send halo data and recv into extended part of arrays
-  halo->Exchange(EX, Nverts, ogs_dfloat);
-  halo->Exchange(EY, Nverts, ogs_dfloat);
-  if(dim==3)
-    halo->Exchange(EZ, Nverts, ogs_dfloat);
+  bool verbose = false;
+  halo.Setup(Nelements+totalHaloPairs,
+             globalElementId, comm,
+             ogs::Pairwise, verbose, platform);
+
 }
+
+} //namespace libp
+
diff --git a/libs/mesh/meshHaloTraceSetup.cpp b/libs/mesh/meshHaloTraceSetup.cpp
index 354251038..7045c4620 100644
--- a/libs/mesh/meshHaloTraceSetup.cpp
+++ b/libs/mesh/meshHaloTraceSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,29 +26,23 @@ SOFTWARE.
 
 #include "mesh.hpp"
 
+namespace libp {
+
 /* Set up trace halo infomation for inter-processor MPI
    exchange of trace nodes */
 
 // Setup assumes field to be exchanged is Nelements*Nfields*Np in size
 // with Np being the fastest running index (hence each field entry is strided
 // Np apart)
-halo_t* mesh_t::HaloTraceSetup(int Nfields){
-
-  hlong *globalOffsets = (hlong *) calloc(size+1,sizeof(hlong));
-  hlong localNelements = (hlong) Nelements;
-
-  //gather number of elements on each rank
-  MPI_Allgather(&localNelements, 1, MPI_HLONG, globalOffsets+1, 1, MPI_HLONG, comm);
+ogs::halo_t mesh_t::HaloTraceSetup(int Nfields){
 
-  for(int rr=0;rr<size;++rr)
-    globalOffsets[rr+1] = globalOffsets[rr]+globalOffsets[rr+1];
-
-  hlong globalOffset = globalOffsets[rank];
-  free(globalOffsets);
+  hlong localNelements = Nelements;
+  hlong globalOffset = Nelements;
+  comm.Scan(localNelements, globalOffset);
+  globalOffset -= localNelements;
 
   //populate a global numbering system which has the Nfields stride
-  hlong *globalids = (hlong *) calloc((Nelements+totalHaloPairs)
-                                       *Np*Nfields,sizeof(hlong));
+  memory<hlong> globalids((Nelements+totalHaloPairs)*Np*Nfields);
   for (dlong e=0;e<Nelements;e++) {
     for (int k=0;k<Nfields;k++) {
       for (int n=0;n<Np;n++) {
@@ -59,7 +53,7 @@ halo_t* mesh_t::HaloTraceSetup(int Nfields){
   }
 
   //exchange full Np*Nfields per element global ids
-  halo->Exchange(globalids, Np*Nfields, ogs_hlong);
+  halo.Exchange(globalids, Np*Nfields);
 
   //flag the trace ids we need
   for (dlong e=0;e<Nelements;e++) {
@@ -87,10 +81,12 @@ halo_t* mesh_t::HaloTraceSetup(int Nfields){
   }
 
   int verbose = 0;
-  halo_t* traceHalo = halo_t::Setup((Nelements+totalHaloPairs)*Np*Nfields,
-                                    globalids, comm, verbose, platform);
-
-  free(globalids);
+  ogs::halo_t traceHalo;
+  traceHalo.Setup((Nelements+totalHaloPairs)*Np*Nfields,
+                  globalids, comm,
+                  ogs::Pairwise, verbose, platform);
 
   return traceHalo;
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshMassMatrixApply.cpp b/libs/mesh/meshMassMatrixApply.cpp
index 55ba485d9..2e0b638dd 100644
--- a/libs/mesh/meshMassMatrixApply.cpp
+++ b/libs/mesh/meshMassMatrixApply.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,16 +25,16 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
 
-void mesh_t::MassMatrixApply(occa::memory& o_q, occa::memory& o_Mq) {
+namespace libp {
+
+void mesh_t::MassMatrixApply(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_Mq) {
   //compute Mq = M*q
-  MassMatrixKernel(Nelements, o_ggeo, o_MM, o_q, o_Mq);
+  MassMatrixKernel(Nelements, o_wJ, o_MM, o_q, o_Mq);
 }
 
-void meshTri2D::MassMatrixKernelSetup(int Nfields) {
-  occa::properties kernelInfo = props; //copy base occa properties
+void mesh_t::MassMatrixKernelSetupTri2D(int Nfields) {
+  properties_t kernelInfo = props; //copy base occa properties
   kernelInfo["defines/" "p_Nfields"]= Nfields;
 
   MassMatrixKernel = platform.buildKernel(MESH_DIR "/okl/MassMatrixOperatorTri2D.okl",
@@ -42,8 +42,8 @@ void meshTri2D::MassMatrixKernelSetup(int Nfields) {
                                           kernelInfo);
 }
 
-void meshQuad2D::MassMatrixKernelSetup(int Nfields) {
-  occa::properties kernelInfo = props; //copy base occa properties
+void mesh_t::MassMatrixKernelSetupQuad2D(int Nfields) {
+  properties_t kernelInfo = props; //copy base occa properties
   kernelInfo["defines/" "p_Nfields"]= Nfields;
 
   MassMatrixKernel = platform.buildKernel(MESH_DIR "/okl/MassMatrixOperatorQuad2D.okl",
@@ -51,8 +51,8 @@ void meshQuad2D::MassMatrixKernelSetup(int Nfields) {
                                           kernelInfo);
 }
 
-void meshTet3D::MassMatrixKernelSetup(int Nfields) {
-  occa::properties kernelInfo = props; //copy base occa properties
+void mesh_t::MassMatrixKernelSetupTet3D(int Nfields) {
+  properties_t kernelInfo = props; //copy base occa properties
   kernelInfo["defines/" "p_Nfields"]= Nfields;
 
   MassMatrixKernel = platform.buildKernel(MESH_DIR "/okl/MassMatrixOperatorTet3D.okl",
@@ -60,8 +60,8 @@ void meshTet3D::MassMatrixKernelSetup(int Nfields) {
                                           kernelInfo);
 }
 
-void meshHex3D::MassMatrixKernelSetup(int Nfields) {
-  occa::properties kernelInfo = props; //copy base occa properties
+void mesh_t::MassMatrixKernelSetupHex3D(int Nfields) {
+  properties_t kernelInfo = props; //copy base occa properties
   kernelInfo["defines/" "p_Nfields"]= Nfields;
 
   MassMatrixKernel = platform.buildKernel(MESH_DIR "/okl/MassMatrixOperatorHex3D.okl",
@@ -69,15 +69,4 @@ void meshHex3D::MassMatrixKernelSetup(int Nfields) {
                                           kernelInfo);
 }
 
-void meshTri3D::MassMatrixKernelSetup(int Nfields) {
-  LIBP_ABORT("MassMatrixOperatorTri3D not implemented yet.")
-}
-
-void meshQuad3D::MassMatrixKernelSetup(int Nfields) {
-  occa::properties kernelInfo = props; //copy base occa properties
-  kernelInfo["defines/" "p_Nfields"]= Nfields;
-
-  MassMatrixKernel = platform.buildKernel(MESH_DIR "/okl/MassMatrixOperatorQuad2D.okl",
-                                          "MassMatrixOperatorQuad2D",
-                                          kernelInfo);
-}
\ No newline at end of file
+} //namespace libp
diff --git a/libs/mesh/meshMinCharacteristicLength.cpp b/libs/mesh/meshMinCharacteristicLength.cpp
index 2217f3a7a..4d54cf79b 100644
--- a/libs/mesh/meshMinCharacteristicLength.cpp
+++ b/libs/mesh/meshMinCharacteristicLength.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,8 +25,8 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 dfloat mesh_t::MinCharacteristicLength(){
 
@@ -34,17 +34,15 @@ dfloat mesh_t::MinCharacteristicLength(){
   for(dlong e=0;e<Nelements;++e){
     dfloat h = ElementCharacteristicLength(e);
 
-    hmin = mymin(hmin, h);
+    hmin = std::min(hmin, h);
   }
 
   // MPI_Allreduce to get global minimum h
-  dfloat ghmin = 0.0;
-  MPI_Allreduce(&hmin, &ghmin, 1, MPI_DFLOAT, MPI_MIN, comm);
-
-  return ghmin;
+  comm.Allreduce(hmin, Comm::Min);
+  return hmin;
 }
 
-dfloat meshTri2D::ElementCharacteristicLength(dlong e) {
+dfloat mesh_t::ElementCharacteristicLengthTri2D(dlong e) {
 
   dfloat h = std::numeric_limits<dfloat>::max();
   for(int f=0;f<Nfaces;++f){
@@ -56,12 +54,12 @@ dfloat meshTri2D::ElementCharacteristicLength(dlong e) {
     // h = 2/(sJ/J)
     dfloat hest = 2.0/(sJ*invJ);
 
-    h = mymin(h, hest);
+    h = std::min(h, hest);
   }
   return h;
 }
 
-dfloat meshQuad2D::ElementCharacteristicLength(dlong e) {
+dfloat mesh_t::ElementCharacteristicLengthQuad2D(dlong e) {
 
   dfloat h = std::numeric_limits<dfloat>::max();
 
@@ -80,12 +78,12 @@ dfloat meshQuad2D::ElementCharacteristicLength(dlong e) {
     // h = 1/(sJ/J)
     dfloat hest = J/sJ;
 
-    h = mymin(h, hest);
+    h = std::min(h, hest);
   }
   return h;
 }
 
-dfloat meshTri3D::ElementCharacteristicLength(dlong e) {
+dfloat mesh_t::ElementCharacteristicLengthTet3D(dlong e) {
 
   dfloat h = std::numeric_limits<dfloat>::max();
   for(int f=0;f<Nfaces;++f){
@@ -93,16 +91,16 @@ dfloat meshTri3D::ElementCharacteristicLength(dlong e) {
     dfloat sJ   = sgeo[sid + SJID];
     dfloat invJ = sgeo[sid + IJID];
 
-    // sJ = L/2, J = A/2,   sJ/J = L/A = L/(0.5*h*L) = 2/h
+    // sJ = A/2, J = 3*V/4,   sJ/J = 2*A/3*V = 2*A/3*(A*h/3) = 2/h
     // h = 2/(sJ/J)
     dfloat hest = 2.0/(sJ*invJ);
 
-    h = mymin(h, hest);
+    h = std::min(h, hest);
   }
   return h;
 }
 
-dfloat meshQuad3D::ElementCharacteristicLength(dlong e) {
+dfloat mesh_t::ElementCharacteristicLengthHex3D(dlong e) {
 
   dfloat h = std::numeric_limits<dfloat>::max();
 
@@ -121,48 +119,9 @@ dfloat meshQuad3D::ElementCharacteristicLength(dlong e) {
     // h = 1/(sJ/J)
     dfloat hest = J/sJ;
 
-    h = mymin(h, hest);
-  }
-  return h;
-}
-
-dfloat meshTet3D::ElementCharacteristicLength(dlong e) {
-
-  dfloat h = std::numeric_limits<dfloat>::max();
-  for(int f=0;f<Nfaces;++f){
-    dlong sid = Nsgeo*(Nfaces*e + f);
-    dfloat sJ   = sgeo[sid + SJID];
-    dfloat invJ = sgeo[sid + IJID];
-
-    // sJ = A/2, J = 3*V/4,   sJ/J = 2*A/3*V = 2*A/3*(A*h/3) = 2/h
-    // h = 2/(sJ/J)
-    dfloat hest = 2.0/(sJ*invJ);
-
-    h = mymin(h, hest);
+    h = std::min(h, hest);
   }
   return h;
 }
 
-dfloat meshHex3D::ElementCharacteristicLength(dlong e) {
-
-  dfloat h = std::numeric_limits<dfloat>::max();
-
-  //sum weighted Jacobians to integrate over the element
-  dfloat J = 0.0;
-  for (int n=0;n<Np;n++)
-    J += vgeo[Nvgeo*Np*e + n + Np*JWID];
-
-  for(int f=0;f<Nfaces;++f){
-    //sum weighted surface Jacobians to integrate over face
-    dfloat sJ = 0.0;
-    for (int i=0;i<Nfp;i++)
-      sJ += sgeo[Nsgeo*(Nfaces*Nfp*e + Nfp*f + i) + WSJID];
-
-    // sJ = L, J = A,   sJ/J = L/A = L/(h*L) = 1/h
-    // h = 1/(sJ/J)
-    dfloat hest = J/sJ;
-
-    h = mymin(h, hest);
-  }
-  return h;
-}
\ No newline at end of file
+} //namespace libp
diff --git a/libs/mesh/meshMultiRateHaloTraceSetup.cpp b/libs/mesh/meshMultiRateHaloTraceSetup.cpp
index dac2791bf..31a621488 100644
--- a/libs/mesh/meshMultiRateHaloTraceSetup.cpp
+++ b/libs/mesh/meshMultiRateHaloTraceSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,29 +26,23 @@ SOFTWARE.
 
 #include "mesh.hpp"
 
+namespace libp {
+
 /* Set up trace halo infomation for inter-processor MPI
    exchange of trace nodes */
 
 // Setup assumes field to be exchanged is Nelements*Nfields**Nfaces*Nfp in size
 // with Nfp being the fastest running index (hence each field entry is strided
 // Nfaces*Nfp apart)
-halo_t** mesh_t::MultiRateHaloTraceSetup(int Nfields){
-
-  hlong *globalOffsets = (hlong *) calloc(size+1,sizeof(hlong));
-  hlong localNelements = (hlong) Nelements;
-
-  //gather number of elements on each rank
-  MPI_Allgather(&localNelements, 1, MPI_HLONG, globalOffsets+1, 1, MPI_HLONG, comm);
+memory<ogs::halo_t> mesh_t::MultiRateHaloTraceSetup(int Nfields){
 
-  for(int rr=0;rr<size;++rr)
-    globalOffsets[rr+1] = globalOffsets[rr]+globalOffsets[rr+1];
-
-  hlong globalOffset = globalOffsets[rank];
-  free(globalOffsets);
+  hlong localNelements = Nelements;
+  hlong globalOffset = Nelements;
+  comm.Scan(localNelements, globalOffset);
+  globalOffset -= localNelements;
 
   //populate a global numbering system which has the Nfields stride
-  hlong *globalids = (hlong *) calloc((Nelements+totalHaloPairs)
-                                       *Np*Nfields,sizeof(hlong));
+  memory<hlong> globalids((Nelements+totalHaloPairs)*Np*Nfields);
   for (dlong e=0;e<Nelements;e++) {
     for (int k=0;k<Nfields;k++) {
       for (int n=0;n<Np;n++) {
@@ -59,9 +53,8 @@ halo_t** mesh_t::MultiRateHaloTraceSetup(int Nfields){
   }
 
   //make a trace array populated with the global ids
-  hlong *traceIds = (hlong *) calloc((Nelements+totalHaloPairs)
-                                       *Nfp*Nfaces*Nfields,sizeof(hlong));
-
+  memory<hlong> traceIds((Nelements+totalHaloPairs)
+                         *Nfp*Nfaces*Nfields);
   for (dlong e=0;e<Nelements;e++) {
     for (int n=0;n<Nfp*Nfaces;n++) {
       const dlong vid = e*Nfp*Nfaces + n;
@@ -80,7 +73,7 @@ halo_t** mesh_t::MultiRateHaloTraceSetup(int Nfields){
   }
 
   //exchange full Nfp*Nfaces*Nfields per element global trace ids
-  halo->Exchange(traceIds, Nfp*Nfaces*Nfields, ogs_hlong);
+  halo.Exchange(traceIds, Nfp*Nfaces*Nfields);
 
   //the halo region is filled, but there are duplicate IDs in the local section
   // bad news for the halo exchange, so remove them
@@ -104,12 +97,17 @@ halo_t** mesh_t::MultiRateHaloTraceSetup(int Nfields){
   }
 
   //make array of halo exchangers
-  halo_t** mrTraceHalo = (halo_t **) malloc(mrNlevels*sizeof(halo_t*));
+  memory<ogs::halo_t> mrTraceHalo(mrNlevels);
 
   //make a global trace id array to be used for exchange on each multirate level
-  hlong *mrTraceIds = (hlong *) calloc((Nelements+totalHaloPairs)
-                                       *Nfp*Nfaces*Nfields,sizeof(hlong));
-  memcpy(mrTraceIds, traceIds, Nelements*Nfp*Nfaces*Nfields*sizeof(hlong)); //copy local part
+  memory<hlong> mrTraceIds((Nelements+totalHaloPairs)
+                           *Nfp*Nfaces*Nfields);
+  mrTraceIds.copyFrom(traceIds, Nelements*Nfp*Nfaces*Nfields); //copy local part
+
+  /*Zero halo region*/
+  for (dlong n=0;n<totalHaloPairs*Nfp*Nfaces*Nfields;n++) {
+    mrTraceIds[n+Nelements*Nfp*Nfaces*Nfields] = 0;
+  }
 
   //for each multirate level
   for (int lev=0;lev<mrNlevels;lev++) {
@@ -137,16 +135,15 @@ halo_t** mesh_t::MultiRateHaloTraceSetup(int Nfields){
     }
 
     int verbose = 0;
-    mrTraceHalo[lev] = halo_t::Setup((Nelements+totalHaloPairs)*Nfp*Nfaces*Nfields,
-                                      mrTraceIds, comm, verbose, platform);
+    mrTraceHalo[lev].Setup((Nelements+totalHaloPairs)*Nfp*Nfaces*Nfields,
+                            mrTraceIds, comm,
+                            ogs::Pairwise, verbose, platform);
 
     //no need to zero out mrTraceIds for next multirate level
     // the next level set includes the lower level elements
   }
 
-  free(globalids);
-  free(traceIds);
-  free(mrTraceIds);
-
   return mrTraceHalo;
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshMultiRateSetup.cpp b/libs/mesh/meshMultiRateSetup.cpp
index 176ab993c..758207d1d 100644
--- a/libs/mesh/meshMultiRateSetup.cpp
+++ b/libs/mesh/meshMultiRateSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,23 +26,21 @@ SOFTWARE.
 
 #include "mesh.hpp"
 
-void mesh_t::MultiRateSetup(dfloat *EToDT) {
+namespace libp {
+
+void mesh_t::MultiRateSetup(memory<dfloat> EToDT) {
 
   const int maxLevels = 100;
 
   //find global min and max dt
-  dfloat dtmin=1.e9, dtmax=0.0;
-  if (Nelements) {
-    dtmin = EToDT[0];
-    dtmax = EToDT[0];
-  }
-  for (dlong e=1;e<Nelements;e++) {
-    dtmin = mymin(dtmin,EToDT[e]);
-    dtmax = mymax(dtmax,EToDT[e]);
+  dfloat dtmin = std::numeric_limits<dfloat>::max();
+  dfloat dtmax = std::numeric_limits<dfloat>::min();
+  for (dlong e=0;e<Nelements;e++) {
+    dtmin = std::min(dtmin,EToDT[e]);
+    dtmax = std::max(dtmax,EToDT[e]);
   }
-  dfloat dtGmin, dtGmax;
-  MPI_Allreduce(&dtmin, &dtGmin, 1, MPI_DFLOAT, MPI_MIN, comm);
-  MPI_Allreduce(&dtmax, &dtGmax, 1, MPI_DFLOAT, MPI_MIN, comm);
+  comm.Allreduce(dtmin, Comm::Min);
+  comm.Allreduce(dtmax, Comm::Max);
 
   if (rank==0) {
     printf("--------------- MultiRate Timestepping Setup ----------------\n");
@@ -50,22 +48,24 @@ void mesh_t::MultiRateSetup(dfloat *EToDT) {
   }
 
   //number of levels
-  mrNlevels = mymin(floor(log2(dtGmax/dtGmin))+1,maxLevels);
+  mrNlevels = std::min(static_cast <int>(std::floor(std::log2(dtmax/dtmin)))+1,
+                                         maxLevels);
 
   //compute the level of each element
-  mrLevel = (int *) calloc(Nelements+totalHaloPairs,sizeof(int));
+  mrLevel.malloc(Nelements+totalHaloPairs);
   for(int lev=0; lev<mrNlevels; lev++){
-    dfloat dtlev = dtGmin*(2<<lev);
+    dfloat dtlev = dtmin*(1<<lev);
     for(dlong e=0;e<Nelements;++e){
-      if(EToDT[e] >=dtlev)
+      if(EToDT[e] >=dtlev) {
         mrLevel[e] = lev;
+      }
     }
   }
 
   //enforce one level difference between neighbours
   for (int lev=0; lev < mrNlevels; lev++){
 
-    halo->Exchange(mrLevel, 1, ogs_int);
+    halo.Exchange(mrLevel, 1);
 
     for (dlong e=0; e<Nelements;e++) {
       if (mrLevel[e] > lev+1) { //find elements at least 2 levels higher than lev
@@ -82,21 +82,20 @@ void mesh_t::MultiRateSetup(dfloat *EToDT) {
   //this could change the number of levels there are, so find the new max level
   mrNlevels = 0;
   for (dlong e=0;e<Nelements;e++)
-    mrNlevels = (mrLevel[e]>mrNlevels) ? mrLevel[e] : mrNlevels;
+    mrNlevels = std::max(mrLevel[e],mrNlevels);
   mrNlevels++;
 
-  int localNlevels = mrNlevels;
-  MPI_Allreduce(&localNlevels, &mrNlevels, 1, MPI_INT, MPI_MAX, comm);
+  comm.Allreduce(mrNlevels, Comm::Max);
 
   //construct element and halo lists
   // mrElements[lev] - list of all elements with multirate level <= lev
   // mrInterfaceElements[lev] - list of all elements with multirate level = lev,
   //                                with a neighbor of level lev-1
-  mrNelements          = (dlong *) calloc(mrNlevels,sizeof(dlong));
-  mrInterfaceNelements = (dlong *) calloc(mrNlevels,sizeof(dlong));
+  mrNelements.malloc(mrNlevels, 0);
+  mrInterfaceNelements.malloc(mrNlevels, 0);
 
-  mrElements          = (dlong **) calloc(mrNlevels,sizeof(dlong*));
-  mrInterfaceElements = (dlong **) calloc(mrNlevels,sizeof(dlong*));
+  mrElements.malloc(mrNlevels);
+  mrInterfaceElements.malloc(mrNlevels);
 
   for (dlong e=0;e<Nelements;e++) {
     int lev = mrLevel[e];
@@ -116,12 +115,12 @@ void mesh_t::MultiRateSetup(dfloat *EToDT) {
 
   //allocate space
   for (int lev =0;lev<mrNlevels;lev++){
-    mrElements[lev]          = (dlong *) calloc(mrNelements[lev],sizeof(dlong));
-    mrInterfaceElements[lev] = (dlong *) calloc(mrInterfaceNelements[lev],sizeof(dlong));
+    mrElements[lev].malloc(mrNelements[lev]);
+    mrInterfaceElements[lev].malloc(mrInterfaceNelements[lev]);
   }
 
-  int *cnt  = (int *) calloc(mrNlevels,sizeof(int));
-  int *cnt2 = (int *) calloc(mrNlevels,sizeof(int));
+  memory<int> cnt(mrNlevels, 0);
+  memory<int> cnt2(mrNlevels, 0);
 
   //fill element lists
   for (dlong e=0;e<Nelements;e++){
@@ -139,20 +138,19 @@ void mesh_t::MultiRateSetup(dfloat *EToDT) {
       }
     }
   }
-  free(cnt); free(cnt2);
 
-  o_mrLevel = platform.malloc(Nelements*sizeof(int), mrLevel);
-  o_mrNelements = platform.malloc(mrNlevels*sizeof(dlong), mrNelements);
-  o_mrInterfaceNelements = platform.malloc(mrNlevels*sizeof(dlong), mrInterfaceNelements);
+  o_mrLevel = platform.malloc<int>(Nelements, mrLevel);
+  o_mrNelements = platform.malloc<dlong>(mrNlevels, mrNelements);
+  o_mrInterfaceNelements = platform.malloc<dlong>(mrNlevels, mrInterfaceNelements);
 
-  o_mrElements          = new occa::memory[mrNlevels];
-  o_mrInterfaceElements = new occa::memory[mrNlevels];
+  o_mrElements.malloc(mrNlevels);
+  o_mrInterfaceElements.malloc(mrNlevels);
 
   for (int lev =0;lev<mrNlevels;lev++){
     if (mrNelements[lev])
-      o_mrElements[lev]          = platform.malloc(mrNelements[lev]*sizeof(dlong), mrElements[lev]);
+      o_mrElements[lev]          = platform.malloc<dlong>(mrNelements[lev], mrElements[lev]);
     if (mrInterfaceNelements[lev])
-      o_mrInterfaceElements[lev] = platform.malloc(mrInterfaceNelements[lev]*sizeof(dlong), mrInterfaceElements[lev]);
+      o_mrInterfaceElements[lev] = platform.malloc<dlong>(mrInterfaceNelements[lev], mrInterfaceElements[lev]);
   }
 
   if (rank==0){
@@ -163,15 +161,13 @@ void mesh_t::MultiRateSetup(dfloat *EToDT) {
   hlong Ntotal=0;
   for (int lev=0; lev<mrNlevels; lev++) {
 
-    hlong levNelementsLocal = mrNelements[lev];
-    hlong levNelements=0;
-    MPI_Allreduce(&levNelementsLocal, &levNelements, 1, MPI_HLONG, MPI_SUM, comm);
+    hlong levNelements = mrNelements[lev];
+    comm.Allreduce(levNelements);
     levNelements -= Ntotal;
     Ntotal += levNelements;
 
-    dlong levInterfaceNelementsLocal = mrInterfaceNelements[lev];
-    dlong levInterfaceNelements=0;
-    MPI_Allreduce(&levInterfaceNelementsLocal, &levInterfaceNelements, 1, MPI_DLONG, MPI_SUM, comm);
+    dlong levInterfaceNelements = mrInterfaceNelements[lev];
+    comm.Allreduce(levInterfaceNelements);
 
     if (rank==0)
       printf("|   %3d |      %12lu |                 %12lu  |\n", lev, (size_t)levNelements, (size_t)levInterfaceNelements);
@@ -179,3 +175,5 @@ void mesh_t::MultiRateSetup(dfloat *EToDT) {
   if (rank==0)
     printf("-------------------------------------------------------------\n");
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshOccaSetup.cpp b/libs/mesh/meshOccaSetup.cpp
deleted file mode 100644
index 2ac7b4dfb..000000000
--- a/libs/mesh/meshOccaSetup.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-
-void mesh_t::OccaSetup(){
-
-  if(NinternalElements)
-    o_internalElementIds    =
-      platform.malloc(NinternalElements*sizeof(dlong), internalElementIds);
-
-  if(NhaloElements)
-    o_haloElementIds = platform.malloc(NhaloElements*sizeof(dlong), haloElementIds);
-
-  if(NglobalGatherElements)
-    o_globalGatherElementList =
-      platform.malloc(NglobalGatherElements*sizeof(dlong), globalGatherElementList);
-
-  if(NlocalGatherElements)
-    o_localGatherElementList =
-      platform.malloc(NlocalGatherElements*sizeof(dlong), localGatherElementList);
-
-  o_vmapM = platform.malloc(Nelements*Nfp*Nfaces*sizeof(dlong), vmapM);
-  o_vmapP = platform.malloc(Nelements*Nfp*Nfaces*sizeof(dlong), vmapP);
-  o_mapP  = platform.malloc(Nelements*Nfp*Nfaces*sizeof(dlong), mapP);
-
-  o_EToB = platform.malloc(Nelements*Nfaces*sizeof(int), EToB);
-
-  props["defines/" "p_dim"]= dim;
-  props["defines/" "p_N"]= N;
-  props["defines/" "p_Nq"]= N+1;
-  props["defines/" "p_Np"]= Np;
-  props["defines/" "p_Nfp"]= Nfp;
-  props["defines/" "p_Nfaces"]= Nfaces;
-  props["defines/" "p_NfacesNfp"]= Nfp*Nfaces;
-  props["defines/" "p_Nvgeo"]= Nvgeo;
-  props["defines/" "p_Nsgeo"]= Nsgeo;
-  props["defines/" "p_Nggeo"]= Nggeo;
-}
diff --git a/libs/mesh/meshOccaSetup2D.cpp b/libs/mesh/meshOccaSetup2D.cpp
deleted file mode 100644
index 25de1659c..000000000
--- a/libs/mesh/meshOccaSetup2D.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-
-void mesh2D::OccaSetup(){
-
-  this->mesh_t::OccaSetup();
-
-  o_x = platform.malloc(Nelements*Np*sizeof(dfloat), x);
-  o_y = platform.malloc(Nelements*Np*sizeof(dfloat), y);
-  o_z = o_y; // dummy z variable
-
-  props["defines/" "p_NXID"]= NXID;
-  props["defines/" "p_NYID"]= NYID;
-  props["defines/" "p_SJID"]= SJID;
-  props["defines/" "p_IJID"]= IJID;
-  props["defines/" "p_IHID"]= IHID;
-  props["defines/" "p_WIJID"]= WIJID;
-  props["defines/" "p_WSJID"]= WSJID;
-
-  props["defines/" "p_G00ID"]= G00ID;
-  props["defines/" "p_G01ID"]= G01ID;
-  props["defines/" "p_G11ID"]= G11ID;
-  props["defines/" "p_GWJID"]= GWJID;
-
-  props["defines/" "p_RXID"]= RXID;
-  props["defines/" "p_SXID"]= SXID;
-  props["defines/" "p_RYID"]= RYID;
-  props["defines/" "p_SYID"]= SYID;
-
-  props["defines/" "p_JID"]= JID;
-  props["defines/" "p_JWID"]= JWID;
-  props["defines/" "p_IJWID"]= IJWID;
-
-}
diff --git a/libs/mesh/meshOccaSetup3D.cpp b/libs/mesh/meshOccaSetup3D.cpp
deleted file mode 100644
index b0586c46c..000000000
--- a/libs/mesh/meshOccaSetup3D.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
-
-void mesh3D::OccaSetup(){
-
-  this->mesh_t::OccaSetup();
-
-  o_x = platform.malloc(Nelements*Np*sizeof(dfloat), x);
-  o_y = platform.malloc(Nelements*Np*sizeof(dfloat), y);
-  o_z = platform.malloc(Nelements*Np*sizeof(dfloat), z);
-
-  props["defines/" "p_NXID"]= NXID;
-  props["defines/" "p_NYID"]= NYID;
-  props["defines/" "p_NZID"]= NZID;
-  props["defines/" "p_SJID"]= SJID;
-  props["defines/" "p_IJID"]= IJID;
-  props["defines/" "p_IHID"]= IHID;
-  props["defines/" "p_WSJID"]= WSJID;
-  props["defines/" "p_WIJID"]= WIJID;
-  props["defines/" "p_STXID"]= STXID;
-  props["defines/" "p_STYID"]= STYID;
-  props["defines/" "p_STZID"]= STZID;
-  props["defines/" "p_SBXID"]= SBXID;
-  props["defines/" "p_SBYID"]= SBYID;
-  props["defines/" "p_SBZID"]= SBZID;
-
-  props["defines/" "p_G00ID"]= G00ID;
-  props["defines/" "p_G01ID"]= G01ID;
-  props["defines/" "p_G02ID"]= G02ID;
-  props["defines/" "p_G11ID"]= G11ID;
-  props["defines/" "p_G12ID"]= G12ID;
-  props["defines/" "p_G22ID"]= G22ID;
-  props["defines/" "p_GWJID"]= GWJID;
-
-
-  props["defines/" "p_RXID"]= RXID;
-  props["defines/" "p_SXID"]= SXID;
-  props["defines/" "p_TXID"]= TXID;
-
-  props["defines/" "p_RYID"]= RYID;
-  props["defines/" "p_SYID"]= SYID;
-  props["defines/" "p_TYID"]= TYID;
-
-  props["defines/" "p_RZID"]= RZID;
-  props["defines/" "p_SZID"]= SZID;
-  props["defines/" "p_TZID"]= TZID;
-
-  props["defines/" "p_JID"]= JID;
-  props["defines/" "p_JWID"]= JWID;
-  props["defines/" "p_IJWID"]= IJWID;
-}
diff --git a/libs/mesh/meshOccaSetupHex3D.cpp b/libs/mesh/meshOccaSetupHex3D.cpp
deleted file mode 100644
index acfb681b8..000000000
--- a/libs/mesh/meshOccaSetupHex3D.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
-
-void meshHex3D::OccaSetup(){
-
-  this->mesh3D::OccaSetup();
-
-  o_D = platform.malloc(Nq*Nq*sizeof(dfloat), D);
-
-  o_S    = o_D; //dummy
-  o_MM   = o_D; //dummy
-  o_sM   = o_D; //dummy
-  o_LIFT = o_D; //dummy
-
-  o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*Np*sizeof(dfloat), vgeo);
-  o_sgeo = platform.malloc(Nelements*Nfaces*Nfp*Nsgeo*sizeof(dfloat), sgeo);
-  o_ggeo = platform.malloc(Nelements*Np*Nggeo*sizeof(dfloat), ggeo);
-
-  /* NC: disabling until we re-add treatment of affine elements
-
-  // build trilinear geometric factors for hexes
-  if(settings.compareSetting("ELEMENT MAP", "AFFINE")){
-    // pack gllz, gllw, and elementwise EXYZ
-    hlong Nxyz = Nelements*dim*Nverts;
-    EXYZ  = (dfloat*) calloc(Nxyz, sizeof(dfloat));
-    gllzw = (dfloat*) calloc(2*Nq, sizeof(dfloat));
-
-    int sk = 0;
-    for(int n=0;n<Nq;++n)
-      gllzw[sk++] = gllz[n];
-    for(int n=0;n<Nq;++n)
-      gllzw[sk++] = gllw[n];
-
-    sk = 0;
-    for(hlong e=0;e<Nelements;++e){
-      for(int v=0;v<Nverts;++v)
-        EXYZ[sk++] = EX[e*Nverts+v];
-      for(int v=0;v<Nverts;++v)
-        EXYZ[sk++] = EY[e*Nverts+v];
-      for(int v=0;v<Nverts;++v)
-        EXYZ[sk++] = EZ[e*Nverts+v];
-    }
-
-    // nodewise ggeo with element coordinates and gauss node info
-    o_EXYZ  = device.malloc(Nxyz*sizeof(dfloat), EXYZ);
-    o_gllzw = device.malloc(2*Nq*sizeof(dfloat), gllzw);
-  }
-
-  ggeoNoJW = (dfloat*) calloc(Np*Nelements*6,sizeof(dfloat));
-  for(int e=0;e<Nelements;++e){
-    for(int n=0;n<Np;++n){
-      ggeoNoJW[e*Np*6 + n + 0*Np] = ggeo[e*Np*Nggeo + n + G00ID*Np];
-      ggeoNoJW[e*Np*6 + n + 1*Np] = ggeo[e*Np*Nggeo + n + G01ID*Np];
-      ggeoNoJW[e*Np*6 + n + 2*Np] = ggeo[e*Np*Nggeo + n + G02ID*Np];
-      ggeoNoJW[e*Np*6 + n + 3*Np] = ggeo[e*Np*Nggeo + n + G11ID*Np];
-      ggeoNoJW[e*Np*6 + n + 4*Np] = ggeo[e*Np*Nggeo + n + G12ID*Np];
-      ggeoNoJW[e*Np*6 + n + 5*Np] = ggeo[e*Np*Nggeo + n + G22ID*Np];
-    }
-  }
-  o_ggeoNoJW = device.malloc(Np*Nelements*6*sizeof(dfloat), ggeoNoJW);
-  */
-}
diff --git a/libs/mesh/meshOccaSetupTet3D.cpp b/libs/mesh/meshOccaSetupTet3D.cpp
deleted file mode 100644
index 4c315e857..000000000
--- a/libs/mesh/meshOccaSetupTet3D.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
-
-void meshTet3D::OccaSetup(){
-
-  this->mesh3D::OccaSetup();
-
-  // build transposes (we hold matrices as column major on device)
-  dfloat *DT = (dfloat*) calloc(3*Np*Np, sizeof(dfloat));
-  dfloat *DrT = DT + 0*Np*Np;
-  dfloat *DsT = DT + 1*Np*Np;
-  dfloat *DtT = DT + 2*Np*Np;
-  matrixTranspose(Np, Np, Dr, Np, DrT, Np);
-  matrixTranspose(Np, Np, Ds, Np, DsT, Np);
-  matrixTranspose(Np, Np, Dt, Np, DtT, Np);
-
-  dfloat *LIFTT = (dfloat*) calloc(Np*Nfaces*Nfp, sizeof(dfloat));
-  matrixTranspose(Np, Nfp*Nfaces, LIFT, Nfp*Nfaces, LIFTT, Np);
-
-  dfloat *sMT = (dfloat *) calloc(Np*Nfaces*Nfp,sizeof(dfloat));
-  matrixTranspose(Np, Nfp*Nfaces, sM, Nfp*Nfaces, sMT, Np);
-
-  dfloat *ST = (dfloat*) calloc(6*Np*Np, sizeof(dfloat));
-  dfloat *SrrT = ST + 0*Np*Np;
-  dfloat *SrsT = ST + 1*Np*Np;
-  dfloat *SrtT = ST + 2*Np*Np;
-  dfloat *SssT = ST + 3*Np*Np;
-  dfloat *SstT = ST + 4*Np*Np;
-  dfloat *SttT = ST + 5*Np*Np;
-  matrixTranspose(Np, Np, Srr, Np, SrrT, Np);
-  matrixTranspose(Np, Np, Srs, Np, SrsT, Np);
-  matrixTranspose(Np, Np, Srt, Np, SrtT, Np);
-  matrixTranspose(Np, Np, Sss, Np, SssT, Np);
-  matrixTranspose(Np, Np, Sst, Np, SstT, Np);
-  matrixTranspose(Np, Np, Stt, Np, SttT, Np);
-
-  o_D = platform.malloc(3*Np*Np*sizeof(dfloat), DT);
-  o_MM = platform.malloc(Np*Np*sizeof(dfloat), MM); //MM is symmetric
-
-  o_sM = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), sMT);
-
-  o_LIFT = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), LIFTT);
-
-  o_S = platform.malloc(6*Np*Np*sizeof(dfloat), ST);
-
-  o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*sizeof(dfloat), vgeo);
-  o_sgeo = platform.malloc(Nelements*Nfaces*Nsgeo*sizeof(dfloat), sgeo);
-  o_ggeo = platform.malloc(Nelements*Nggeo*sizeof(dfloat), ggeo);
-
-  free(DT);
-  free(LIFTT);
-  free(sMT);
-  free(ST);
-}
diff --git a/libs/mesh/meshOccaSetupTri2D.cpp b/libs/mesh/meshOccaSetupTri2D.cpp
deleted file mode 100644
index 249d1e8c2..000000000
--- a/libs/mesh/meshOccaSetupTri2D.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-
-void meshTri2D::OccaSetup(){
-
-  this->mesh2D::OccaSetup();
-
-  // build transposes (we hold matrices as column major on device)
-  dfloat *DT = (dfloat*) calloc(2*Np*Np, sizeof(dfloat));
-  dfloat *DrT = DT + 0*Np*Np;
-  dfloat *DsT = DT + 1*Np*Np;
-  matrixTranspose(Np, Np, Dr, Np, DrT, Np);
-  matrixTranspose(Np, Np, Ds, Np, DsT, Np);
-
-  dfloat *LIFTT = (dfloat*) calloc(Np*Nfaces*Nfp, sizeof(dfloat));
-  matrixTranspose(Np, Nfp*Nfaces, LIFT, Nfp*Nfaces, LIFTT, Np);
-
-  dfloat *sMT = (dfloat *) calloc(Np*Nfaces*Nfp,sizeof(dfloat));
-  matrixTranspose(Np, Nfp*Nfaces, sM, Nfp*Nfaces, sMT, Np);
-
-  dfloat *ST = (dfloat*) calloc(3*Np*Np, sizeof(dfloat));
-  dfloat *SrrT = ST + 0*Np*Np;
-  dfloat *SrsT = ST + 1*Np*Np;
-  dfloat *SssT = ST + 2*Np*Np;
-  matrixTranspose(Np, Np, Srr, Np, SrrT, Np);
-  matrixTranspose(Np, Np, Srs, Np, SrsT, Np);
-  matrixTranspose(Np, Np, Sss, Np, SssT, Np);
-
-  o_D = platform.malloc(2*Np*Np*sizeof(dfloat), DT);
-  o_MM = platform.malloc(Np*Np*sizeof(dfloat), MM); //MM is symmetric
-
-  o_sM = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), sMT);
-
-  o_LIFT = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), LIFTT);
-
-  o_S = platform.malloc(3*Np*Np*sizeof(dfloat), ST);
-
-  o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*sizeof(dfloat), vgeo);
-  o_sgeo = platform.malloc(Nelements*Nfaces*Nsgeo*sizeof(dfloat), sgeo);
-  o_ggeo = platform.malloc(Nelements*Nggeo*sizeof(dfloat), ggeo);
-
-  free(DT);
-  free(LIFTT);
-  free(sMT);
-  free(ST);
-}
diff --git a/libs/mesh/meshOccaSetupTri3D.cpp b/libs/mesh/meshOccaSetupTri3D.cpp
deleted file mode 100644
index 4985b3089..000000000
--- a/libs/mesh/meshOccaSetupTri3D.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
-
-  The MIT License (MIT)
-
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in all
-  copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-  SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
-
-void meshTri3D::OccaSetup(){
-
-  this->mesh3D::OccaSetup();
-
-  // build transposes (we hold matrices as column major on device)
-  dfloat *DT = (dfloat*) calloc(2*Np*Np, sizeof(dfloat));
-  dfloat *DrT = DT + 0*Np*Np;
-  dfloat *DsT = DT + 1*Np*Np;
-  matrixTranspose(Np, Np, Dr, Np, DrT, Np);
-  matrixTranspose(Np, Np, Ds, Np, DsT, Np);
-
-  dfloat *LIFTT = (dfloat*) calloc(Np*Nfaces*Nfp, sizeof(dfloat));
-  matrixTranspose(Np, Nfp*Nfaces, LIFT, Nfp*Nfaces, LIFTT, Np);
-
-  dfloat *sMT = (dfloat *) calloc(Np*Nfaces*Nfp,sizeof(dfloat));
-  matrixTranspose(Np, Nfp*Nfaces, sM, Nfp*Nfaces, sMT, Np);
-
-  dfloat *ST = (dfloat*) calloc(3*Np*Np, sizeof(dfloat));
-  dfloat *SrrT = ST + 0*Np*Np;
-  dfloat *SrsT = ST + 1*Np*Np;
-  dfloat *SssT = ST + 2*Np*Np;
-  matrixTranspose(Np, Np, Srr, Np, SrrT, Np);
-  matrixTranspose(Np, Np, Srs, Np, SrsT, Np);
-  matrixTranspose(Np, Np, Sss, Np, SssT, Np);
-
-  o_D = platform.malloc(2*Np*Np*sizeof(dfloat), DT);
-  o_MM = platform.malloc(Np*Np*sizeof(dfloat), MM); //MM is symmetric
-
-  o_sM = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), sMT);
-
-  o_LIFT = platform.malloc(Np*Nfaces*Nfp*sizeof(dfloat), LIFTT);
-
-  o_S = platform.malloc(3*Np*Np*sizeof(dfloat), ST);
-
-  o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*sizeof(dfloat), vgeo);
-  o_sgeo = platform.malloc(Nelements*Nfaces*Nsgeo*sizeof(dfloat), sgeo);
-  o_ggeo = platform.malloc(Nelements*Nggeo*sizeof(dfloat), ggeo);
-
-  free(DT);
-  free(LIFTT);
-  free(sMT);
-  free(ST);
-}
diff --git a/libs/mesh/meshParallelConnectNodes.cpp b/libs/mesh/meshParallelConnectNodes.cpp
deleted file mode 100644
index 5343e4a2c..000000000
--- a/libs/mesh/meshParallelConnectNodes.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-
-
-// uniquely label each node with a global index, used for gatherScatter
-void mesh_t::ParallelConnectNodes(){
-
-  hlong localNodeCount = Np*Nelements;
-  hlong *allLocalNodeCounts = (hlong*) calloc(size, sizeof(hlong));
-
-  MPI_Allgather(&localNodeCount,    1, MPI_HLONG,
-                allLocalNodeCounts, 1, MPI_HLONG,
-                comm);
-
-  hlong gatherNodeStart = 0;
-  for(int rr=0;rr<rank;++rr)
-    gatherNodeStart += allLocalNodeCounts[rr];
-
-  free(allLocalNodeCounts);
-
-  // form continuous node numbering (local=>virtual gather)
-  int *baseRank = (int *) malloc((totalHaloPairs+Nelements)*Np*sizeof(int));
-  globalIds = (hlong *) malloc((totalHaloPairs+Nelements)*Np*sizeof(hlong));
-
-  // use local numbering
-  for(dlong e=0;e<Nelements;++e){
-    for(int n=0;n<Np;++n){
-      dlong id = e*Np+n;
-
-      baseRank[id] = rank;
-      globalIds[id] = 1 + id + Nnodes + gatherNodeStart;
-    }
-
-    // use vertex ids for vertex nodes to reduce iterations
-    for(int v=0;v<Nverts;++v){
-      hlong id = e*Np + vertexNodes[v];
-      hlong gid = EToV[e*Nverts+v] + 1;
-      globalIds[id] = gid;
-    }
-  }
-
-  hlong localChange = 0, gatherChange = 1;
-
-  // keep comparing numbers on positive and negative traces until convergence
-  while(gatherChange>0){
-
-    // reset change counter
-    localChange = 0;
-
-    // send halo data and recv into extension of buffer
-    halo->Exchange(baseRank, Np, ogs_int);
-    halo->Exchange(globalIds, Np, ogs_hlong);
-
-    // compare trace nodes
-    for(dlong e=0;e<Nelements;++e){
-      for(int n=0;n<Nfp*Nfaces;++n){
-        dlong id  = e*Nfp*Nfaces + n;
-        dlong idM = vmapM[id];
-        dlong idP = vmapP[id];
-        hlong gidM = globalIds[idM];
-        hlong gidP = globalIds[idP];
-
-        int baseRankM = baseRank[idM];
-        int baseRankP = baseRank[idP];
-
-        if(gidM<gidP || (gidP==gidM && baseRankM<baseRankP)){
-          ++localChange;
-          baseRank[idP]  = baseRank[idM];
-          globalIds[idP] = globalIds[idM];
-        }
-
-        if(gidP<gidM || (gidP==gidM && baseRankP<baseRankM)){
-          ++localChange;
-          baseRank[idM]  = baseRank[idP];
-          globalIds[idM] = globalIds[idP];
-        }
-      }
-    }
-
-    // sum up changes
-    MPI_Allreduce(&localChange, &gatherChange, 1, MPI_HLONG, MPI_MAX, comm);
-  }
-
-  free(baseRank);
-}
diff --git a/libs/mesh/meshParallelConnectOpt.cpp b/libs/mesh/meshParallelConnectOpt.cpp
deleted file mode 100644
index 09999c9db..000000000
--- a/libs/mesh/meshParallelConnectOpt.cpp
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-
-typedef struct {
-  hlong v[4]; // vertices on face
-  dlong element, elementN;
-  int face, rank;    // face info
-  int faceN, rankN; // N for neighbor face info
-
-}parallelFace_t;
-
-// mesh is the local partition
-void mesh_t::ParallelConnect(){
-
-  // serial connectivity on each process
-  this->Connect();
-
-  // count # of elements to send to each rank based on
-  // minimum {vertex id % size}
-  int *Nsend = (int*) calloc(size, sizeof(int));
-  int *Nrecv = (int*) calloc(size, sizeof(int));
-  int *sendOffsets = (int*) calloc(size, sizeof(int));
-  int *recvOffsets = (int*) calloc(size, sizeof(int));
-
-  // WARNING: In some corner cases, the number of faces to send may overrun int storage
-  int allNsend = 0;
-  for(dlong e=0;e<Nelements;++e){
-    for(int f=0;f<Nfaces;++f){
-      if(EToE[e*Nfaces+f]==-1){
-        // find rank of destination for sorting based on max(face vertices)%size
-        hlong maxv = 0;
-        for(int n=0;n<NfaceVertices;++n){
-          int nid = faceVertices[f*NfaceVertices+n];
-          dlong id = EToV[e*Nverts + nid];
-          maxv = mymax(maxv, id);
-        }
-        int destRank = (int) (maxv%size);
-
-        // increment send size for
-        ++Nsend[destRank];
-        ++allNsend;
-      }
-    }
-  }
-
-  // find send offsets
-  for(int rr=1;rr<size;++rr)
-    sendOffsets[rr] = sendOffsets[rr-1] + Nsend[rr-1];
-
-  // reset counters
-  for(int rr=0;rr<size;++rr)
-    Nsend[rr] = 0;
-
-  // buffer for outgoing data
-  parallelFace_t *sendFaces = (parallelFace_t*) calloc(allNsend, sizeof(parallelFace_t));
-
-  // Make the MPI_PARALLELFACE_T data type
-  MPI_Datatype MPI_PARALLELFACE_T;
-  MPI_Datatype dtype[7] = {MPI_HLONG, MPI_DLONG, MPI_DLONG, MPI_INT,
-                            MPI_INT, MPI_INT, MPI_INT};
-  int blength[7] = {4, 1, 1, 1, 1, 1, 1};
-  MPI_Aint addr[7], displ[7];
-  MPI_Get_address ( &(sendFaces[0]              ), addr+0);
-  MPI_Get_address ( &(sendFaces[0].element      ), addr+1);
-  MPI_Get_address ( &(sendFaces[0].elementN     ), addr+2);
-  MPI_Get_address ( &(sendFaces[0].face         ), addr+3);
-  MPI_Get_address ( &(sendFaces[0].rank         ), addr+4);
-  MPI_Get_address ( &(sendFaces[0].faceN        ), addr+5);
-  MPI_Get_address ( &(sendFaces[0].rankN        ), addr+6);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  displ[4] = addr[4] - addr[0];
-  displ[5] = addr[5] - addr[0];
-  displ[6] = addr[6] - addr[0];
-  MPI_Type_create_struct (7, blength, displ, dtype, &MPI_PARALLELFACE_T);
-  MPI_Type_commit (&MPI_PARALLELFACE_T);
-
-  // pack face data
-  for(dlong e=0;e<Nelements;++e){
-    for(int f=0;f<Nfaces;++f){
-      if(EToE[e*Nfaces+f]==-1){
-
-        // find rank of destination for sorting based on max(face vertices)%size
-        hlong maxv = 0;
-        for(int n=0;n<NfaceVertices;++n){
-          int nid = faceVertices[f*NfaceVertices+n];
-          hlong id = EToV[e*Nverts + nid];
-          maxv = mymax(maxv, id);
-        }
-        int destRank = (int) (maxv%size);
-
-        // populate face to send out staged in segment of sendFaces array
-        int id = sendOffsets[destRank]+Nsend[destRank];
-
-
-        sendFaces[id].element = e;
-        sendFaces[id].face = f;
-        for(int n=0;n<NfaceVertices;++n){
-          int nid = faceVertices[f*NfaceVertices+n];
-          sendFaces[id].v[n] = EToV[e*Nverts + nid];
-        }
-
-        std::sort(sendFaces[id].v, sendFaces[id].v+NfaceVertices,
-                  std::less<hlong>());
-
-        sendFaces[id].rank = rank;
-
-        sendFaces[id].elementN = -1;
-        sendFaces[id].faceN = -1;
-        sendFaces[id].rankN = -1;
-
-        ++Nsend[destRank];
-      }
-    }
-  }
-
-  // exchange byte counts
-  MPI_Alltoall(Nsend, 1, MPI_INT,
-               Nrecv, 1, MPI_INT,
-               comm);
-
-  // count incoming faces
-  int allNrecv = 0;
-  for(int rr=0;rr<size;++rr)
-    allNrecv += Nrecv[rr];
-
-  // find offsets for recv data
-  for(int rr=1;rr<size;++rr)
-    recvOffsets[rr] = recvOffsets[rr-1] + Nrecv[rr-1]; // byte offsets
-
-  // buffer for incoming face data
-  parallelFace_t *recvFaces = (parallelFace_t*) calloc(allNrecv, sizeof(parallelFace_t));
-
-  // exchange parallel faces
-  MPI_Alltoallv(sendFaces, Nsend, sendOffsets, MPI_PARALLELFACE_T,
-                recvFaces, Nrecv, recvOffsets, MPI_PARALLELFACE_T,
-                comm);
-
-  // local sort allNrecv received faces
-  std::sort(recvFaces, recvFaces+allNrecv,
-            [&](const parallelFace_t& a, const parallelFace_t& b) {
-              return std::lexicographical_compare(a.v, a.v+NfaceVertices,
-                                                  b.v, b.v+NfaceVertices);
-            });
-
-  // find matches
-  for(int n=0;n<allNrecv-1;++n){
-    // since vertices are ordered we just look for pairs
-    if(std::equal(recvFaces[n].v, recvFaces[n].v+NfaceVertices,
-                  recvFaces[n+1].v)){
-      recvFaces[n].elementN = recvFaces[n+1].element;
-      recvFaces[n].faceN = recvFaces[n+1].face;
-      recvFaces[n].rankN = recvFaces[n+1].rank;
-
-      recvFaces[n+1].elementN = recvFaces[n].element;
-      recvFaces[n+1].faceN = recvFaces[n].face;
-      recvFaces[n+1].rankN = recvFaces[n].rank;
-    }
-  }
-
-  // sort back to original ordering
-  std::sort(recvFaces, recvFaces+allNrecv,
-            [](const parallelFace_t& a, const parallelFace_t& b) {
-              if(a.rank < b.rank) return true;
-              if(a.rank > b.rank) return false;
-
-              if(a.element < b.element) return true;
-              if(a.element > b.element) return false;
-
-              return (a.face < b.face);
-            });
-
-  // send faces back from whence they came
-  MPI_Alltoallv(recvFaces, Nrecv, recvOffsets, MPI_PARALLELFACE_T,
-                sendFaces, Nsend, sendOffsets, MPI_PARALLELFACE_T,
-                comm);
-
-  // extract connectivity info
-  EToP = (int*) calloc(Nelements*Nfaces, sizeof(int));
-  for(dlong cnt=0;cnt<Nelements*Nfaces;++cnt)
-    EToP[cnt] = -1;
-
-  for(int cnt=0;cnt<allNsend;++cnt){
-    dlong e = sendFaces[cnt].element;
-    dlong eN = sendFaces[cnt].elementN;
-    int f = sendFaces[cnt].face;
-    int fN = sendFaces[cnt].faceN;
-    int rN = sendFaces[cnt].rankN;
-
-    if(e>=0 && f>=0 && eN>=0 && fN>=0){
-      EToE[e*Nfaces+f] = eN;
-      EToF[e*Nfaces+f] = fN;
-      EToP[e*Nfaces+f] = rN;
-    }
-  }
-
-  MPI_Barrier(comm);
-  MPI_Type_free(&MPI_PARALLELFACE_T);
-  free(sendFaces);
-  free(recvFaces);
-
-  //record the number of elements in the whole mesh
-  hlong NelementsLocal = (hlong) Nelements;
-  NelementsGlobal = 0;
-  MPI_Allreduce(&NelementsLocal, &NelementsGlobal, 1, MPI_HLONG, MPI_SUM, comm);
-}
diff --git a/libs/mesh/meshParallelReaderQuad3D.cpp b/libs/mesh/meshParallelReaderQuad3D.cpp
deleted file mode 100644
index 963f4efab..000000000
--- a/libs/mesh/meshParallelReaderQuad3D.cpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
-
-/*
-   purpose: read gmsh quadrilateral mesh
-*/
-void meshQuad3D::ParallelReader(const char *fileName){
-
-  FILE *fp = fopen(fileName, "r");
-
-  dim = 3;
-  Nverts = 4; // number of vertices per element
-  Nfaces = 4;
-  NfaceVertices = 2;
-
-  int faceVertices_[4][2] = {{0,1},{1,2},{2,3},{3,0}};
-
-  faceVertices =
-    (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
-
-  memcpy(faceVertices, faceVertices_[0], NfaceVertices*Nfaces*sizeof(int));
-
-  if(fp==NULL){
-    stringstream ss;
-    ss << "Cannot open file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
-
-  char buf[BUFSIZ];
-  do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-  }while(!strstr(buf, "$Nodes"));
-
-  /* read number of nodes in mesh */
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
-  sscanf(buf, hlongFormat, &(Nnodes));
-
-  /* allocate space for node coordinates */
-  dfloat *VX = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-  dfloat *VY = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-  dfloat *VZ = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-
-  /* load nodes */
-  for(int n=0;n<Nnodes;++n){
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-    sscanf(buf, "%*d" dfloatFormat dfloatFormat dfloatFormat,
-	   VX+n, VY+n, VZ+n);
-  }
-
-  /* look for section with Element node data */
-  do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-  }while(!strstr(buf, "$Elements"));
-
-  /* read number of nodes in mesh */
-  hlong gNelements;
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
-  sscanf(buf, hlongFormat, &(gNelements));
-
-  /* find # of quadrilaterals */
-  fpos_t fpos;
-  fgetpos(fp, &fpos);
-  hlong Nquadrilaterals = 0;
-
-  hlong gNboundaryFaces = 0;
-  for(int n=0;n<gNelements;++n){
-    int ElementType;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-    sscanf(buf, "%*d%d", &ElementType);
-    if(ElementType==1) ++gNboundaryFaces;
-    if(ElementType==3) ++Nquadrilaterals;
-  }
-  // rewind to start of elements
-  fsetpos(fp, &fpos);
-
-  int chunk = Nquadrilaterals/size;
-  int remainder = Nquadrilaterals - chunk*size;
-
-  int NquadrilateralsLocal = chunk + (rank<remainder);
-
-  /* where do these elements start ? */
-  int start = rank*chunk + mymin(rank, remainder);
-  int end = start + NquadrilateralsLocal-1;
-
-  /* allocate space for Element node index data */
-
-  EToV
-    = (hlong*) calloc(NquadrilateralsLocal*Nverts,
-		     sizeof(hlong));
-
-  elementInfo
-    = (hlong*) calloc(NquadrilateralsLocal,sizeof(hlong));
-
-  /* scan through file looking for quadrilateral elements */
-  int cnt=0, bcnt=0;
-  Nquadrilaterals = 0;
-
-  boundaryInfo = (hlong*) calloc(gNboundaryFaces*3, sizeof(hlong));
-  for(int n=0;n<gNelements;++n){
-    int ElementType;
-    hlong v1, v2, v3, v4;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-    sscanf(buf, "%*d%d", &ElementType);
-
-    if(ElementType==1){ // boundary face
-      sscanf(buf, "%*d%*d %*d" hlongFormat "%*d " hlongFormat hlongFormat,
-	     boundaryInfo+bcnt*3, &v1, &v2);
-      boundaryInfo[bcnt*3+1] = v1-1;
-      boundaryInfo[bcnt*3+2] = v2-1;
-      ++bcnt;
-    }
-
-    if(ElementType==3){  // quadrilateral
-      if(start<=Nquadrilaterals && Nquadrilaterals<=end){
-        sscanf(buf, "%*d%*d%*d " hlongFormat " %*d" hlongFormat hlongFormat hlongFormat hlongFormat,
-               elementInfo+cnt, &v1, &v2, &v3, &v4);
-
-#if 0
-	// check orientation
-	dfloat xe1 = VX[v1-1], xe2 = VX[v2-1], xe4 = VX[v4-1];
-	dfloat ye1 = VY[v1-1], ye2 = VY[v2-1], ye4 = VY[v4-1];
-	dfloat J = 0.25*((xe2-xe1)*(ye4-ye1) - (xe4-xe1)*(ye2-ye1));
-	if(J<0){
-	  int v4tmp = v4;
-	  v4 = v2;
-	  v2 = v4tmp;
-	  printf("unwarping element\n");
-	}
-#endif
-
-	/* read vertex triplet for trianngle */
-	EToV[cnt*Nverts+0] = v1-1;
-	EToV[cnt*Nverts+1] = v2-1;
-	EToV[cnt*Nverts+2] = v3-1;
-	EToV[cnt*Nverts+3] = v4-1;
-	++cnt;
-      }
-      ++Nquadrilaterals;
-    }
-  }
-  fclose(fp);
-
-  /* record number of boundary faces found */
-  NboundaryFaces = bcnt;
-
-  /* record number of found quadrilaterals */
-  Nelements = NquadrilateralsLocal;
-
-  /* collect vertices for each element */
-  EX = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  EZ = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  for(int e=0;e<Nelements;++e){
-    for(int n=0;n<Nverts;++n){
-      EX[e*Nverts+n] = VX[EToV[e*Nverts+n]];
-      EY[e*Nverts+n] = VY[EToV[e*Nverts+n]];
-      EZ[e*Nverts+n] = VZ[EToV[e*Nverts+n]];
-#if 0
-      printf("e %d v %d %g %g %g\n",
-	     e, n,
-	     EX[e*Nverts+n],
-	     EY[e*Nverts+n],
-	     EZ[e*Nverts+n]);
-#endif
-    }
-  }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
-  free(VZ);
-}
-
diff --git a/libs/mesh/meshParallelReaderTri3D.cpp b/libs/mesh/meshParallelReaderTri3D.cpp
deleted file mode 100644
index 5287694eb..000000000
--- a/libs/mesh/meshParallelReaderTri3D.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
-
-/*
-   purpose: read gmsh triangle mesh
-*/
-void meshTri3D::ParallelReader(const char *fileName){
-
-  FILE *fp = fopen(fileName, "r");
-
-  dim = 3;
-  Nverts = 3; // number of vertices per element
-  Nfaces = 3;
-  NfaceVertices = 2;
-
-  /* vertices on each face */
-  int faceVertices_[4][2] = {{0,1},{1,2},{2,0}};
-
-  faceVertices =
-    (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
-
-  memcpy(faceVertices, faceVertices_[0], NfaceVertices*Nfaces*sizeof(int));
-
-  if(fp==NULL){
-    stringstream ss;
-    ss << "Cannot open file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
-
-  char buf[BUFSIZ];
-
-
-  // look for Nodes section
-  do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-  }while(!strstr(buf, "$Nodes"));
-
-  /* read number of nodes in mesh */
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
-  sscanf(buf, hlongFormat, &(Nnodes));
-
-  /* allocate space for node coordinates */
-  dfloat *VX = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-  dfloat *VY = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-  dfloat *VZ = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-
-  /* load nodes */
-  for(int n=0;n<Nnodes;++n){
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-    sscanf(buf, "%*d" dfloatFormat dfloatFormat dfloatFormat,
-	   VX+n, VY+n, VZ+n);
-  }
-
-  /* look for section with Element node data */
-  do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-  }while(!strstr(buf, "$Elements"));
-
-  /* read number of nodes in mesh */
-  hlong gNelements;
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
-  sscanf(buf, hlongFormat, &(gNelements));
-
-  /* find # of triangles */
-  fpos_t fpos;
-  fgetpos(fp, &fpos);
-  hlong Ntriangles = 0;
-  hlong gNboundaryFaces = 0;
-  for(int n=0;n<gNelements;++n){
-    int ElementType;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-    sscanf(buf, "%*d%d", &ElementType);
-    if(ElementType==1) ++gNboundaryFaces;
-    if(ElementType==2) ++Ntriangles;
-  }
-  // rewind to start of elements
-  fsetpos(fp, &fpos);
-
-  int chunk = Ntriangles/size;
-  int remainder = Ntriangles - chunk*size;
-
-  int NtrianglesLocal = chunk + (rank<remainder);
-
-  /* where do these elements start ? */
-  int start = rank*chunk + mymin(rank, remainder);
-  int end = start + NtrianglesLocal-1;
-
-  /* allocate space for Element node index data */
-
-  EToV
-    = (hlong*) calloc(NtrianglesLocal*Nverts,
-		     sizeof(hlong));
-  elementInfo
-    = (hlong*) calloc(NtrianglesLocal,sizeof(hlong));
-
-  /* scan through file looking for triangle elements */
-  int cnt=0, bcnt=0;
-  Ntriangles = 0;
-
-  boundaryInfo = (hlong*) calloc(gNboundaryFaces*3, sizeof(hlong));
-  for(int n=0;n<gNelements;++n){
-    int ElementType, v1, v2, v3;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-    sscanf(buf, "%*d%d", &ElementType);
-    if(ElementType==1){ // boundary face
-      sscanf(buf, "%*d%*d %*d" hlongFormat "%*d %d%d",
-	     boundaryInfo+bcnt*3, &v1, &v2);
-      boundaryInfo[bcnt*3+1] = v1-1;
-      boundaryInfo[bcnt*3+2] = v2-1;
-      ++bcnt;
-    }
-    if(ElementType==2){  // triangle
-      if(start<=Ntriangles && Ntriangles<=end){
-	sscanf(buf, "%*d%*d%*d " hlongFormat " %*d %d%d%d",
-	      elementInfo+cnt, &v1, &v2, &v3);
-
-	// check orientation
-	// dfloat xe1 = VX[v1-1], xe2 = VX[v2-1], xe3 = VX[v3-1];
-	// dfloat ye1 = VY[v1-1], ye2 = VY[v2-1], ye3 = VY[v3-1];
-	// dfloat ze1 = VZ[v1-1], ze2 = VZ[v2-1], ze3 = VZ[v3-1];
-
-#if 0
-	// TW: no idea
-	dfloat J = 0.25*((xe2-xe1)*(ye3-ye1) - (xe3-xe1)*(ye2-ye1));
-	if(J<0){
-	  int v3tmp = v3;
-	  v3 = v2;
-	  v2 = v3tmp;
-	  //	  printf("unwarping element\n");
-	}
-#endif
-
-	/* read vertex triplet for trianngle */
-	EToV[cnt*Nverts+0] = v1-1;
-	EToV[cnt*Nverts+1] = v2-1;
-	EToV[cnt*Nverts+2] = v3-1;
-
-	++cnt;
-      }
-      ++Ntriangles;
-    }
-  }
-  fclose(fp);
-
-  /* record number of boundary faces found */
-  NboundaryFaces = bcnt;
-
-  /* record number of found triangles */
-  Nelements = NtrianglesLocal;
-
-  /* collect vertices for each element */
-  EX = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  EZ = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  for(int e=0;e<Nelements;++e){
-    for(int n=0;n<Nverts;++n){
-      EX[e*Nverts+n] = VX[EToV[e*Nverts+n]];
-      EY[e*Nverts+n] = VY[EToV[e*Nverts+n]];
-      EZ[e*Nverts+n] = VZ[EToV[e*Nverts+n]];
-    }
-  }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
-  free(VZ);
-
-}
-
diff --git a/libs/mesh/meshPartition.cpp b/libs/mesh/meshPartition.cpp
new file mode 100644
index 000000000..debd565e1
--- /dev/null
+++ b/libs/mesh/meshPartition.cpp
@@ -0,0 +1,51 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "mesh.hpp"
+#include "parAdogs.hpp"
+
+namespace libp {
+// Partition this mesh by handing its connectivity and vertex data to the parAdogs partitioner.
+void mesh_t::Partition(){
+  // NOTE(review): presumably MeshPartition redistributes these arrays in place across comm -- confirm in parAdogs.hpp
+  paradogs::MeshPartition(platform,
+                          settings,
+                          Nelements,
+                          dim,
+                          Nverts,
+                          Nfaces,
+                          NfaceVertices,
+                          faceVertices,
+                          EToV,
+                          EToE,
+                          EToF,
+                          EX,
+                          EY,
+                          EZ,
+                          comm);
+}
+
+} //namespace libp
diff --git a/libs/mesh/meshPhysicalNodesHex3D.cpp b/libs/mesh/meshPhysicalNodesHex3D.cpp
index 832d86d74..86fecf784 100644
--- a/libs/mesh/meshPhysicalNodesHex3D.cpp
+++ b/libs/mesh/meshPhysicalNodesHex3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,15 +25,16 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshHex3D::PhysicalNodes(){
+namespace libp {
 
-  x = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  y = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  z = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
+void mesh_t::PhysicalNodesHex3D(){
 
-  dlong cnt = 0;
+  x.malloc(Nelements*Np);
+  y.malloc(Nelements*Np);
+  z.malloc(Nelements*Np);
+
+  #pragma omp parallel for
   for(dlong e=0;e<Nelements;++e){ /* for each element */
 
     dlong id = e*Nverts;
@@ -73,7 +74,7 @@ void meshHex3D::PhysicalNodes(){
       dfloat tn = t[n];
 
       /* physical coordinate of interpolation node */
-      x[cnt] =
+      x[e*Np + n] =
         +0.125*(1-rn)*(1-sn)*(1-tn)*xe1
         +0.125*(1+rn)*(1-sn)*(1-tn)*xe2
         +0.125*(1+rn)*(1+sn)*(1-tn)*xe3
@@ -83,7 +84,7 @@ void meshHex3D::PhysicalNodes(){
         +0.125*(1+rn)*(1+sn)*(1+tn)*xe7
         +0.125*(1-rn)*(1+sn)*(1+tn)*xe8;
 
-      y[cnt] =
+      y[e*Np + n] =
         +0.125*(1-rn)*(1-sn)*(1-tn)*ye1
         +0.125*(1+rn)*(1-sn)*(1-tn)*ye2
         +0.125*(1+rn)*(1+sn)*(1-tn)*ye3
@@ -93,7 +94,7 @@ void meshHex3D::PhysicalNodes(){
         +0.125*(1+rn)*(1+sn)*(1+tn)*ye7
         +0.125*(1-rn)*(1+sn)*(1+tn)*ye8;
 
-      z[cnt] =
+      z[e*Np + n] =
         +0.125*(1-rn)*(1-sn)*(1-tn)*ze1
         +0.125*(1+rn)*(1-sn)*(1-tn)*ze2
         +0.125*(1+rn)*(1+sn)*(1-tn)*ze3
@@ -102,12 +103,12 @@ void meshHex3D::PhysicalNodes(){
         +0.125*(1+rn)*(1-sn)*(1+tn)*ze6
         +0.125*(1+rn)*(1+sn)*(1+tn)*ze7
         +0.125*(1-rn)*(1+sn)*(1+tn)*ze8;
-
-      ++cnt;
     }
   }
 
-  halo->Exchange(x, Np, ogs_dfloat);
-  halo->Exchange(y, Np, ogs_dfloat);
-  halo->Exchange(z, Np, ogs_dfloat);
+  o_x = platform.malloc<dfloat>(x);
+  o_y = platform.malloc<dfloat>(y);
+  o_z = platform.malloc<dfloat>(z);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshPhysicalNodesQuad2D.cpp b/libs/mesh/meshPhysicalNodesQuad2D.cpp
index 82a05210c..af6fc3b25 100644
--- a/libs/mesh/meshPhysicalNodesQuad2D.cpp
+++ b/libs/mesh/meshPhysicalNodesQuad2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,15 +25,15 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
 
-void meshQuad2D::PhysicalNodes(){
+namespace libp {
 
-  x = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  y = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  z = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
+void mesh_t::PhysicalNodesQuad2D(){
 
-  dlong cnt = 0;
+  x.malloc(Nelements*Np);
+  y.malloc(Nelements*Np);
+
+  #pragma omp parallel for
   for(dlong e=0;e<Nelements;++e){ /* for each element */
 
     dlong id = e*Nverts;
@@ -55,22 +55,22 @@ void meshQuad2D::PhysicalNodes(){
       dfloat sn = s[n];
 
       /* physical coordinate of interpolation node */
-      x[cnt] =
+      x[e*Np + n] =
         +0.25*(1-rn)*(1-sn)*xe1
         +0.25*(1+rn)*(1-sn)*xe2
         +0.25*(1+rn)*(1+sn)*xe3
         +0.25*(1-rn)*(1+sn)*xe4;
 
-      y[cnt] =
+      y[e*Np + n] =
         +0.25*(1-rn)*(1-sn)*ye1
         +0.25*(1+rn)*(1-sn)*ye2
         +0.25*(1+rn)*(1+sn)*ye3
         +0.25*(1-rn)*(1+sn)*ye4;
-
-      ++cnt;
     }
   }
 
-  halo->Exchange(x, Np, ogs_dfloat);
-  halo->Exchange(y, Np, ogs_dfloat);
+  o_x = platform.malloc<dfloat>(x);
+  o_y = platform.malloc<dfloat>(y);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshPhysicalNodesQuad3D.cpp b/libs/mesh/meshPhysicalNodesQuad3D.cpp
index 295ce6308..c6785a8ea 100644
--- a/libs/mesh/meshPhysicalNodesQuad3D.cpp
+++ b/libs/mesh/meshPhysicalNodesQuad3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,15 +25,16 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshQuad3D::PhysicalNodes(){
+namespace libp {
 
-  x = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  y = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  z = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
+void mesh_t::PhysicalNodesQuad3D(){
 
-  int cnt = 0;
+  x.malloc(Nelements*Np);
+  y.malloc(Nelements*Np);
+  z.malloc(Nelements*Np);
+
+  #pragma omp parallel for
   for(int e=0;e<Nelements;++e){ /* for each element */
 
     int id = e*Nverts;
@@ -83,15 +84,15 @@ void meshQuad3D::PhysicalNodes(){
 
       // project to sphere
       dfloat rlin = sqrt(xlin*xlin+ylin*ylin+zlin*zlin);
-      x[cnt] = xlin/rlin;
-      y[cnt] = ylin/rlin;
-      z[cnt] = zlin/rlin;
-
-      ++cnt;
+      x[e*Np+n] = xlin/rlin;
+      y[e*Np+n] = ylin/rlin;
+      z[e*Np+n] = zlin/rlin;
     }
   }
 
-  halo->Exchange(x, Np, ogs_dfloat);
-  halo->Exchange(y, Np, ogs_dfloat);
-  halo->Exchange(z, Np, ogs_dfloat);
+  o_x = platform.malloc<dfloat>(x);
+  o_y = platform.malloc<dfloat>(y);
+  o_z = platform.malloc<dfloat>(z);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshPhysicalNodesTet3D.cpp b/libs/mesh/meshPhysicalNodesTet3D.cpp
index e9cc68443..5cc2d7d30 100644
--- a/libs/mesh/meshPhysicalNodesTet3D.cpp
+++ b/libs/mesh/meshPhysicalNodesTet3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,15 +25,16 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTet3D::PhysicalNodes(){
+namespace libp {
 
-  x = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  y = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  z = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
+void mesh_t::PhysicalNodesTet3D(){
 
-  dlong cnt = 0;
+  x.malloc(Nelements*Np);
+  y.malloc(Nelements*Np);
+  z.malloc(Nelements*Np);
+
+  #pragma omp parallel for
   for(dlong e=0;e<Nelements;++e){ /* for each element */
 
     dlong id = e*Nverts;
@@ -61,14 +62,15 @@ void meshTet3D::PhysicalNodes(){
       dfloat tn = t[n];
 
       /* physical coordinate of interpolation node */
-      x[cnt] = -0.5*(1+rn+sn+tn)*xe1 + 0.5*(1+rn)*xe2 + 0.5*(1+sn)*xe3 + 0.5*(1+tn)*xe4;
-      y[cnt] = -0.5*(1+rn+sn+tn)*ye1 + 0.5*(1+rn)*ye2 + 0.5*(1+sn)*ye3 + 0.5*(1+tn)*ye4;
-      z[cnt] = -0.5*(1+rn+sn+tn)*ze1 + 0.5*(1+rn)*ze2 + 0.5*(1+sn)*ze3 + 0.5*(1+tn)*ze4;
-      ++cnt;
+      x[e*Np + n] = -0.5*(1+rn+sn+tn)*xe1 + 0.5*(1+rn)*xe2 + 0.5*(1+sn)*xe3 + 0.5*(1+tn)*xe4;
+      y[e*Np + n] = -0.5*(1+rn+sn+tn)*ye1 + 0.5*(1+rn)*ye2 + 0.5*(1+sn)*ye3 + 0.5*(1+tn)*ye4;
+      z[e*Np + n] = -0.5*(1+rn+sn+tn)*ze1 + 0.5*(1+rn)*ze2 + 0.5*(1+sn)*ze3 + 0.5*(1+tn)*ze4;
     }
   }
 
-  halo->Exchange(x, Np, ogs_dfloat);
-  halo->Exchange(y, Np, ogs_dfloat);
-  halo->Exchange(z, Np, ogs_dfloat);
+  o_x = platform.malloc<dfloat>(x);
+  o_y = platform.malloc<dfloat>(y);
+  o_z = platform.malloc<dfloat>(z);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshPhysicalNodesTri2D.cpp b/libs/mesh/meshPhysicalNodesTri2D.cpp
index b8877b1d0..39a2a4587 100644
--- a/libs/mesh/meshPhysicalNodesTri2D.cpp
+++ b/libs/mesh/meshPhysicalNodesTri2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,15 +25,15 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
 
-void meshTri2D::PhysicalNodes(){
+namespace libp {
 
-  x = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  y = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  z = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat)); // dummy
+void mesh_t::PhysicalNodesTri2D(){
 
-  dlong cnt = 0;
+  x.malloc(Nelements*Np);
+  y.malloc(Nelements*Np);
+
+  #pragma omp parallel for
   for(dlong e=0;e<Nelements;++e){ /* for each element */
 
     dlong id = e*Nverts+0;
@@ -53,12 +53,13 @@ void meshTri2D::PhysicalNodes(){
       dfloat sn = s[n];
 
       /* physical coordinate of interpolation node */
-      x[cnt] = -0.5*(rn+sn)*xe1 + 0.5*(1+rn)*xe2 + 0.5*(1+sn)*xe3;
-      y[cnt] = -0.5*(rn+sn)*ye1 + 0.5*(1+rn)*ye2 + 0.5*(1+sn)*ye3;
-      ++cnt;
+      x[e*Np+n] = -0.5*(rn+sn)*xe1 + 0.5*(1+rn)*xe2 + 0.5*(1+sn)*xe3;
+      y[e*Np+n] = -0.5*(rn+sn)*ye1 + 0.5*(1+rn)*ye2 + 0.5*(1+sn)*ye3;
     }
   }
 
-  halo->Exchange(x, Np, ogs_dfloat);
-  halo->Exchange(y, Np, ogs_dfloat);
+  o_x = platform.malloc<dfloat>(x);
+  o_y = platform.malloc<dfloat>(y);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshPhysicalNodesTri3D.cpp b/libs/mesh/meshPhysicalNodesTri3D.cpp
index 720a74258..700f07ba4 100644
--- a/libs/mesh/meshPhysicalNodesTri3D.cpp
+++ b/libs/mesh/meshPhysicalNodesTri3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,15 +25,16 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTri3D::PhysicalNodes(){
+namespace libp {
 
-  x = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  y = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
-  z = (dfloat*) calloc((Nelements+totalHaloPairs)*Np,sizeof(dfloat));
+void mesh_t::PhysicalNodesTri3D(){
 
-  int cnt = 0;
+  x.malloc(Nelements*Np);
+  y.malloc(Nelements*Np);
+  z.malloc(Nelements*Np);
+
+  #pragma omp parallel for
   for(int e=0;e<Nelements;++e){ /* for each element */
 
     int id = e*Nverts+0;
@@ -63,17 +64,17 @@ void meshTri3D::PhysicalNodes(){
 
       // project to sphere
       dfloat rlin = sqrt(xlin*xlin+ylin*ylin+zlin*zlin);
-      x[cnt] = xlin/rlin;
-      y[cnt] = ylin/rlin;
-      z[cnt] = zlin/rlin;
+      x[e*Np+n] = xlin/rlin;
+      y[e*Np+n] = ylin/rlin;
+      z[e*Np+n] = zlin/rlin;
 
       //      printf("x,y,z,rlin=%g,%g,%g,%g\n", xlin/rlin, ylin/rlin, zlin/rlin, rlin);
-      ++cnt;
-
     }
   }
 
-  halo->Exchange(x, Np, ogs_dfloat);
-  halo->Exchange(y, Np, ogs_dfloat);
-  halo->Exchange(z, Np, ogs_dfloat);
+  o_x = platform.malloc<dfloat>(x);
+  o_y = platform.malloc<dfloat>(y);
+  o_z = platform.malloc<dfloat>(z);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshPlotInterpHex3D.cpp b/libs/mesh/meshPlotInterpHex3D.cpp
index 6d064f002..a394accc9 100644
--- a/libs/mesh/meshPlotInterpHex3D.cpp
+++ b/libs/mesh/meshPlotInterpHex3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,24 +25,20 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 //interpolate field to plotting nodes
-void meshHex3D::PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch){
-
-  dfloat *IQ, *IIQ;
-
-  bool alloc_scratch=false;
-  if (scratch==nullptr) {
-    //if not provided with a scratch space, alloc our own
-    alloc_scratch=true;
-    IQ  = (dfloat *) malloc(plotNq*Nq*Nq*sizeof(dfloat));
-    IIQ = (dfloat *) malloc(plotNq*plotNq*Nq*sizeof(dfloat));
-  } else {
-    IQ  = scratch;
-    IIQ = scratch + plotNq*Nq*Nq;
+void mesh_t::PlotInterpHex3D(const memory<dfloat> q, memory<dfloat> Iq, memory<dfloat> scratch){
+
+  if (scratch.length()< static_cast<size_t>(plotNq*Nq*Nq + plotNq*plotNq*Nq)) {
+    //if not provided with enough scratch space, alloc our own
+    scratch.malloc(plotNq*Nq*Nq + plotNq*plotNq*Nq);
   }
 
+  memory<dfloat> IQ  = scratch;
+  memory<dfloat> IIQ = scratch + plotNq*Nq*Nq;
+
   //interpolate in r
   for(int k=0;k<Nq;++k){
     for(int j=0;j<Nq;++j){
@@ -93,9 +89,6 @@ void meshHex3D::PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch){
       }
     }
   }
-
-  //clean up
-  if (alloc_scratch) {
-    free(IQ); free(IIQ);
-  }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshPlotInterpQuad2D.cpp b/libs/mesh/meshPlotInterpQuad2D.cpp
index 54dd8432b..d87d25c83 100644
--- a/libs/mesh/meshPlotInterpQuad2D.cpp
+++ b/libs/mesh/meshPlotInterpQuad2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,21 +25,17 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
 
-//interpolate field to plotting nodes
-void meshQuad2D::PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch){
+namespace libp {
 
-  dfloat *IQ;
+//interpolate field to plotting nodes
+void mesh_t::PlotInterpQuad2D(const memory<dfloat> q, memory<dfloat> Iq, memory<dfloat> scratch){
 
-  bool alloc_scratch=false;
-  if (scratch==nullptr) {
-    //if not provided with a scratch space, alloc our own
-    alloc_scratch=true;
-    IQ  = (dfloat *) malloc(plotNq*Nq*sizeof(dfloat));
-  } else {
-    IQ  = scratch;
+  if (scratch.length()< static_cast<size_t>(plotNq*Nq)) {
+    //if not provided with enough scratch space, alloc our own
+    scratch.malloc(plotNq*Nq);
   }
+  memory<dfloat> IQ  = scratch;
 
   //interpolate in r
   for(int j=0;j<Nq;++j){
@@ -70,9 +66,6 @@ void meshQuad2D::PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch){
       Iq[id] = qn;
     }
   }
-
-  //clean up
-  if (alloc_scratch) {
-    free(IQ);
-  }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshPlotInterpQuad3D.cpp b/libs/mesh/meshPlotInterpQuad3D.cpp
deleted file mode 100644
index 13cef44ee..000000000
--- a/libs/mesh/meshPlotInterpQuad3D.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
-
-//interpolate field to plotting nodes
-void meshQuad3D::PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch){
-
-  dfloat *IQ;
-
-  bool alloc_scratch=false;
-  if (scratch==nullptr) {
-    //if not provided with a scratch space, alloc our own
-    alloc_scratch=true;
-    IQ  = (dfloat *) malloc(plotNq*Nq*sizeof(dfloat));
-  } else {
-    IQ  = scratch;
-  }
-
-  //interpolate in r
-  for(int j=0;j<Nq;++j){
-    for(int i=0;i<plotNq;++i){
-      dfloat qn = 0;
-
-      for(int m=0;m<Nq;++m){
-        const int qid = m + j*Nq;
-        qn += plotInterp[i*Nq+m]*q[qid];
-      }
-
-      const int id = i + j*plotNq;
-      IQ[id] = qn;
-    }
-  }
-
-  //interpolate in s
-  for(int j=0;j<plotNq;++j){
-    for(int i=0;i<plotNq;++i){
-      dfloat qn = 0;
-
-      for(int m=0;m<Nq;++m){
-        const int qid = i + m*plotNq;
-        qn += plotInterp[j*Nq+m]*IQ[qid];
-      }
-
-      const int id = i + j*plotNq;
-      Iq[id] = qn;
-    }
-  }
-
-  //clean up
-  if (alloc_scratch) {
-    free(IQ);
-  }
-}
diff --git a/libs/mesh/meshPlotInterpTet3D.cpp b/libs/mesh/meshPlotInterpTet3D.cpp
index 5194c9d41..b1676d866 100644
--- a/libs/mesh/meshPlotInterpTet3D.cpp
+++ b/libs/mesh/meshPlotInterpTet3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,10 +25,11 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 //interpolate field to plotting nodes
-void meshTet3D::PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch){
+void mesh_t::PlotInterpTet3D(const memory<dfloat> q, memory<dfloat> Iq, memory<dfloat> scratch){
 
   //interpolate
   for(int n=0;n<plotNp;++n){
@@ -40,3 +41,5 @@ void meshTet3D::PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch){
     Iq[n] = qn;
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshPlotInterpTri2D.cpp b/libs/mesh/meshPlotInterpTri2D.cpp
index 97851684e..82afd96cf 100644
--- a/libs/mesh/meshPlotInterpTri2D.cpp
+++ b/libs/mesh/meshPlotInterpTri2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,10 +25,11 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
+
+namespace libp {
 
 //interpolate field to plotting nodes
-void meshTri2D::PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch){
+void mesh_t::PlotInterpTri2D(const memory<dfloat> q, memory<dfloat> Iq, memory<dfloat> scratch){
 
   //interpolate
   for(int n=0;n<plotNp;++n){
@@ -40,3 +41,5 @@ void meshTri2D::PlotInterp(const dfloat* q, dfloat* Iq, dfloat* scratch){
     Iq[n] = qn;
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshPmlSetup.cpp b/libs/mesh/meshPmlSetup.cpp
index 3d15b7d90..fa04422c5 100644
--- a/libs/mesh/meshPmlSetup.cpp
+++ b/libs/mesh/meshPmlSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,8 @@ SOFTWARE.
 
 #include "mesh.hpp"
 
+namespace libp {
+
 void mesh_t::PmlSetup(){
 
   NnonPmlElements=0;
@@ -51,9 +53,9 @@ void mesh_t::PmlSetup(){
       NnonPmlElements++;
   }
 
-  nonPmlElements = (dlong *) malloc(NnonPmlElements*sizeof(dlong));
-  pmlElements    = (dlong *) malloc(NpmlElements*sizeof(dlong));
-  pmlIds         = (dlong *) malloc(NpmlElements*sizeof(dlong*));
+  nonPmlElements.malloc(NnonPmlElements);
+  pmlElements.malloc(NpmlElements);
+  pmlIds.malloc(NpmlElements);
 
   NnonPmlElements=0;
   NpmlElements=0;
@@ -70,20 +72,16 @@ void mesh_t::PmlSetup(){
       nonPmlElements[NnonPmlElements++] = e;
   }
 
-  if (NpmlElements) {
-    o_pmlElements = platform.malloc(NpmlElements*sizeof(dlong), pmlElements);
-    o_pmlIds = platform.malloc(NpmlElements*sizeof(dlong), pmlIds);
-  }
-
-  if (NnonPmlElements)
-    o_nonPmlElements = platform.malloc(NnonPmlElements*sizeof(dlong), nonPmlElements);
+  o_pmlElements = platform.malloc<dlong>(pmlElements);
+  o_pmlIds = platform.malloc<dlong>(pmlIds);
+  o_nonPmlElements = platform.malloc<dlong>(nonPmlElements);
 }
 
 
 void mesh_t::MultiRatePmlSetup(){
 
-  mrNnonPmlElements = (dlong *) calloc(mrNlevels,sizeof(dlong));
-  mrNpmlElements    = (dlong *) calloc(mrNlevels,sizeof(dlong));
+  mrNnonPmlElements.malloc(mrNlevels, 0);
+  mrNpmlElements.malloc(mrNlevels, 0);
 
   //count PML elements
   for (dlong e=0;e<Nelements;e++) {
@@ -106,13 +104,13 @@ void mesh_t::MultiRatePmlSetup(){
       for (int l=lev;l<mrNlevels;l++) mrNnonPmlElements[l]++;
   }
 
-  mrNonPmlElements = (dlong **) malloc(mrNlevels*sizeof(dlong*));
-  mrPmlElements    = (dlong **) malloc(mrNlevels*sizeof(dlong*));
-  mrPmlIds         = (dlong **) malloc(mrNlevels*sizeof(dlong*));
+  mrNonPmlElements.malloc(mrNlevels);
+  mrPmlElements.malloc(mrNlevels);
+  mrPmlIds.malloc(mrNlevels);
   for (int lev=0;lev<mrNlevels;lev++) {
-    mrNonPmlElements[lev] = (dlong *) malloc(mrNnonPmlElements[lev]*sizeof(dlong));
-    mrPmlElements[lev]    = (dlong *) malloc(mrNpmlElements[lev]*sizeof(dlong));
-    mrPmlIds[lev]         = (dlong *) malloc(mrNpmlElements[lev]*sizeof(dlong));
+    mrNonPmlElements[lev].malloc(mrNnonPmlElements[lev]);
+    mrPmlElements[lev].malloc(mrNpmlElements[lev]);
+    mrPmlIds[lev].malloc(mrNpmlElements[lev]);
 
     //reset
     mrNpmlElements[lev] = 0;
@@ -137,16 +135,15 @@ void mesh_t::MultiRatePmlSetup(){
         mrNonPmlElements[l][mrNnonPmlElements[l]++] = e;
   }
 
-  o_mrNonPmlElements = new occa::memory[mrNlevels];
-  o_mrPmlElements    = new occa::memory[mrNlevels];
-  o_mrPmlIds         = new occa::memory[mrNlevels];
+  o_mrNonPmlElements.malloc(mrNlevels);
+  o_mrPmlElements.malloc(mrNlevels);
+  o_mrPmlIds.malloc(mrNlevels);
 
   for (int lev=0;lev<mrNlevels;lev++){
-    if (mrNpmlElements[lev]) {
-      o_mrPmlElements[lev]   = platform.malloc(mrNpmlElements[lev]*sizeof(dlong), mrPmlElements[lev]);
-      o_mrPmlIds[lev] = platform.malloc(mrNpmlElements[lev]*sizeof(dlong), mrPmlIds[lev]);
-    }
-    if (mrNnonPmlElements[lev])
-      o_mrNonPmlElements[lev] = platform.malloc(mrNnonPmlElements[lev]*sizeof(dlong), mrNonPmlElements[lev]);
+    o_mrPmlElements[lev]   = platform.malloc<dlong>(mrPmlElements[lev]);
+    o_mrPmlIds[lev] = platform.malloc<dlong>(mrPmlIds[lev]);
+    o_mrNonPmlElements[lev] = platform.malloc<dlong>(mrNonPmlElements[lev]);
   }
-}
\ No newline at end of file
+}
+
+} //namespace libp
diff --git a/libs/mesh/meshParallelReaderHex3D.cpp b/libs/mesh/meshReadGmshHex3D.cpp
similarity index 60%
rename from libs/mesh/meshParallelReaderHex3D.cpp
rename to libs/mesh/meshReadGmshHex3D.cpp
index 498f797f8..ad09785fa 100644
--- a/libs/mesh/meshParallelReaderHex3D.cpp
+++ b/libs/mesh/meshReadGmshHex3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,84 +25,57 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 /*
    purpose: read gmsh hexrahedra mesh
 */
-void meshHex3D::ParallelReader(const char *fileName){
-
-  FILE *fp = fopen(fileName, "r");
-
-  dim = 3;
-  Nverts = 8; // number of vertices per element
-  Nfaces = 6;
-  NfaceVertices = 4;
-
-  // vertices on each face
-  int _faceVertices[6][4] = {{0,1,2,3},{0,1,5,4},{1,2,6,5},{2,3,7,6},{3,0,4,7},{4,5,6,7}};
-
-  faceVertices =
-    (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
+void mesh_t::ReadGmshHex3D(const std::string fileName){
 
-  memcpy(faceVertices, _faceVertices[0], NfaceVertices*Nfaces*sizeof(int));
-
-  if(fp==NULL){
-    stringstream ss;
-    ss << "Cannot open file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  FILE *fp = fopen(fileName.c_str(), "r");
+  LIBP_ABORT("Cannot open file: " << fileName,
+             fp==NULL);
 
   char buf[BUFSIZ];
   do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
   }while(!strstr(buf, "$Nodes"));
 
   /* read number of nodes in mesh */
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  //read to end of line
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
   sscanf(buf, hlongFormat, &(Nnodes));
 
   /* allocate space for node coordinates */
-  dfloat *VX = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-  dfloat *VY = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-  dfloat *VZ = (dfloat*) calloc(Nnodes, sizeof(dfloat));
+  memory<dfloat> VX(Nnodes);
+  memory<dfloat> VY(Nnodes);
+  memory<dfloat> VZ(Nnodes);
 
   /* load nodes */
   for(hlong n=0;n<Nnodes;++n){
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
     sscanf(buf, "%*d" dfloatFormat dfloatFormat dfloatFormat,
-           VX+n, VY+n, VZ+n);
-
+           VX.ptr()+n, VY.ptr()+n, VZ.ptr()+n);
   }
 
   /* look for section with Element node data */
   do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
   }while(!strstr(buf, "$Elements"));
 
   /* read number of nodes in mesh */
   hlong gNelements;
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  //read to end of line
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
   sscanf(buf, hlongFormat, &gNelements);
 
   /* find # of hexes */
@@ -112,11 +85,9 @@ void meshHex3D::ParallelReader(const char *fileName){
 
   for(hlong n=0;n<gNelements;++n){
     int ElementType;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
     sscanf(buf, "%*d%d", &ElementType);
     if(ElementType==5) ++Nhexes; // hex code is 5
     if(ElementType==3) ++gNboundaryFaces; // quad codes is 3
@@ -130,35 +101,29 @@ void meshHex3D::ParallelReader(const char *fileName){
   hlong NhexesLocal = chunk + (rank<remainder);
 
   /* where do these elements start ? */
-  hlong start = rank*chunk + mymin(rank, remainder);
+  hlong start = rank*chunk + std::min(rank, remainder);
   hlong end = start + NhexesLocal-1;
 
   /* allocate space for Element node index data */
-
-  EToV
-    = (hlong*) calloc(NhexesLocal*Nverts, sizeof(hlong));
-
-  elementInfo
-    = (hlong*) calloc(NhexesLocal,sizeof(hlong));
+  EToV.malloc(NhexesLocal*Nverts);
+  elementInfo.malloc(NhexesLocal);
 
   /* scan through file looking for hexrahedra elements */
   hlong cnt=0, bcnt=0;
   Nhexes = 0;
 
-  boundaryInfo = (hlong*) calloc(gNboundaryFaces*(NfaceVertices+1), sizeof(hlong));
+  boundaryInfo.malloc(gNboundaryFaces*(NfaceVertices+1));
   for(hlong n=0;n<gNelements;++n){
     int ElementType;
     hlong v1, v2, v3, v4, v5, v6, v7, v8;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
     sscanf(buf, "%*d%d", &ElementType);
 
     if(ElementType==3){ // quad boundary face
       sscanf(buf, "%*d%*d %*d" hlongFormat "%*d " hlongFormat hlongFormat hlongFormat hlongFormat,
-             boundaryInfo+bcnt*5, &v1, &v2, &v3, &v4);
+             boundaryInfo.ptr()+bcnt*5, &v1, &v2, &v3, &v4);
 
       boundaryInfo[bcnt*5+1] = v1-1;
       boundaryInfo[bcnt*5+2] = v2-1;
@@ -172,7 +137,7 @@ void meshHex3D::ParallelReader(const char *fileName){
         sscanf(buf,
                "%*d%*d%*d " hlongFormat " %*d"
                hlongFormat hlongFormat hlongFormat hlongFormat hlongFormat hlongFormat hlongFormat hlongFormat,
-               elementInfo+cnt,
+               elementInfo.ptr()+cnt,
                &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8);
 
         EToV[cnt*Nverts+0] = v1-1;
@@ -200,9 +165,9 @@ void meshHex3D::ParallelReader(const char *fileName){
   Nelements = (dlong) NhexesLocal;
 
   /* collect vertices for each element */
-  EX = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  EZ = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
+  EX.malloc(Nverts*Nelements);
+  EY.malloc(Nverts*Nelements);
+  EZ.malloc(Nverts*Nelements);
   for(dlong e=0;e<Nelements;++e){
     for(int n=0;n<Nverts;++n){
       hlong vid = EToV[e*Nverts+n];
@@ -211,10 +176,6 @@ void meshHex3D::ParallelReader(const char *fileName){
       EZ[e*Nverts+n] = VZ[vid];
     }
   }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
-  free(VZ);
 }
 
+} //namespace libp
diff --git a/libs/mesh/meshParallelReaderQuad2D.cpp b/libs/mesh/meshReadGmshQuad2D.cpp
similarity index 60%
rename from libs/mesh/meshParallelReaderQuad2D.cpp
rename to libs/mesh/meshReadGmshQuad2D.cpp
index a7636df12..0af7cdf64 100644
--- a/libs/mesh/meshParallelReaderQuad2D.cpp
+++ b/libs/mesh/meshReadGmshQuad2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,80 +25,55 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
+
+namespace libp {
 
 /*
    purpose: read gmsh quadrilateral mesh
 */
-void meshQuad2D::ParallelReader(const char *fileName){
-
-  FILE *fp = fopen(fileName, "r");
-
-  dim = 2;
-  Nverts = 4; // number of vertices per element
-  Nfaces = 4;
-  NfaceVertices = 2;
-
-  int faceVertices_[4][2] = {{0,1},{1,2},{2,3},{3,0}};
-
-  faceVertices =
-    (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
+void mesh_t::ReadGmshQuad2D(const std::string fileName){
 
-  memcpy(faceVertices, faceVertices_[0], NfaceVertices*Nfaces*sizeof(int));
-
-  if(fp==NULL){
-    stringstream ss;
-    ss << "Cannot open file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  FILE *fp = fopen(fileName.c_str(), "r");
+  LIBP_ABORT("Cannot open file: " << fileName,
+             fp==NULL);
 
   char buf[BUFSIZ];
   do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
   }while(!strstr(buf, "$Nodes"));
 
   /* read number of nodes in mesh */
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  //read to end of line
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
   sscanf(buf, hlongFormat, &(Nnodes));
 
   /* allocate space for node coordinates */
-  dfloat *VX = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-  dfloat *VY = (dfloat*) calloc(Nnodes, sizeof(dfloat));
+  memory<dfloat> VX(Nnodes);
+  memory<dfloat> VY(Nnodes);
 
   /* load nodes */
   for(hlong n=0;n<Nnodes;++n){
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-    sscanf(buf, "%*d" dfloatFormat dfloatFormat, VX+n, VY+n);
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+    sscanf(buf, "%*d" dfloatFormat dfloatFormat, VX.ptr()+n, VY.ptr()+n);
   }
 
   /* look for section with Element node data */
   do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
   }while(!strstr(buf, "$Elements"));
 
   /* read number of nodes in mesh */
   hlong gNelements;
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  //read to end of line
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
   sscanf(buf, hlongFormat, &gNelements);
 
   /* find # of quadrilaterals */
@@ -109,11 +84,9 @@ void meshQuad2D::ParallelReader(const char *fileName){
 
   for(hlong n=0;n<gNelements;++n){
     int ElementType;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
     sscanf(buf, "%*d%d", &ElementType);
     if(ElementType==1) ++gNboundaryFaces;
     if(ElementType==3) ++Nquadrilaterals;
@@ -127,36 +100,29 @@ void meshQuad2D::ParallelReader(const char *fileName){
   hlong NquadrilateralsLocal = chunk + (rank<remainder);
 
   /* where do these elements start ? */
-  hlong start = rank*chunk + mymin(rank, remainder);
+  hlong start = rank*chunk + std::min(rank, remainder);
   hlong end = start + NquadrilateralsLocal-1;
 
   /* allocate space for Element node index data */
-
-  EToV
-    = (hlong*) calloc(NquadrilateralsLocal*Nverts,
-                     sizeof(hlong));
-
-  elementInfo
-    = (hlong*) calloc(NquadrilateralsLocal,sizeof(hlong));
+  EToV.malloc(NquadrilateralsLocal*Nverts);
+  elementInfo.malloc(NquadrilateralsLocal);
 
   /* scan through file looking for quadrilateral elements */
   hlong cnt=0, bcnt=0;
   Nquadrilaterals = 0;
 
-  boundaryInfo = (hlong*) calloc(gNboundaryFaces*3, sizeof(hlong));
+  boundaryInfo.malloc(gNboundaryFaces*3);
   for(hlong n=0;n<gNelements;++n){
     int ElementType;
     hlong v1, v2, v3, v4;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
     sscanf(buf, "%*d%d", &ElementType);
 
     if(ElementType==1){ // boundary face
       sscanf(buf, "%*d%*d %*d" hlongFormat "%*d" hlongFormat hlongFormat,
-             boundaryInfo+bcnt*3, &v1, &v2);
+             boundaryInfo.ptr()+bcnt*3, &v1, &v2);
       boundaryInfo[bcnt*3+1] = v1-1;
       boundaryInfo[bcnt*3+2] = v2-1;
       ++bcnt;
@@ -165,7 +131,7 @@ void meshQuad2D::ParallelReader(const char *fileName){
     if(ElementType==3){  // quadrilateral
       if(start<=Nquadrilaterals && Nquadrilaterals<=end){
         sscanf(buf, "%*d%*d%*d " hlongFormat " %*d" hlongFormat hlongFormat hlongFormat hlongFormat,
-               elementInfo+cnt, &v1, &v2, &v3, &v4);
+               elementInfo.ptr()+cnt, &v1, &v2, &v3, &v4);
 
         // check orientation
         dfloat xe1 = VX[v1-1], xe2 = VX[v2-1], xe4 = VX[v4-1];
@@ -197,18 +163,14 @@ void meshQuad2D::ParallelReader(const char *fileName){
   Nelements = (dlong) NquadrilateralsLocal;
 
   /* collect vertices for each element */
-  EX = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
+  EX.malloc(Nverts*Nelements);
+  EY.malloc(Nverts*Nelements);
   for(dlong e=0;e<Nelements;++e){
     for(int n=0;n<Nverts;++n){
       EX[e*Nverts+n] = VX[EToV[e*Nverts+n]];
       EY[e*Nverts+n] = VY[EToV[e*Nverts+n]];
     }
   }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
-
 }
 
+} //namespace libp
diff --git a/libs/mesh/meshReadGmshQuad3D.cpp b/libs/mesh/meshReadGmshQuad3D.cpp
new file mode 100644
index 000000000..b5fceb472
--- /dev/null
+++ b/libs/mesh/meshReadGmshQuad3D.cpp
@@ -0,0 +1,189 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "mesh.hpp"
+
+namespace libp {
+
+/* purpose: read gmsh quadrilateral mesh whose vertices carry (x,y,z) coordinates.
+   Fills Nnodes, EToV, elementInfo, boundaryInfo, NboundaryFaces, Nelements and EX/EY/EZ.
+   LIBP_ABORT(msg, cond) guards the open and every line read (it fires when cond holds). */
+void mesh_t::ReadGmshQuad3D(const std::string fileName){
+
+  FILE *fp = fopen(fileName.c_str(), "r");
+  LIBP_ABORT("Cannot open file: " << fileName,
+             fp==NULL);
+
+  char buf[BUFSIZ];
+  do{
+    //skip ahead line by line to the "$Nodes" header; abort if the file ends first
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+  }while(!strstr(buf, "$Nodes"));
+
+  /* read number of nodes in mesh */
+  //the line after the header holds the node count
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
+  sscanf(buf, hlongFormat, &(Nnodes));
+
+  /* function-local scratch for all node coordinates (only used to fill EX/EY/EZ below) */
+  memory<dfloat> VX(Nnodes);
+  memory<dfloat> VY(Nnodes);
+  memory<dfloat> VZ(Nnodes);
+
+  /* load nodes: skip the leading node id (%*d), keep x, y, z */
+  for(hlong n=0;n<Nnodes;++n){ // hlong index: Nnodes is parsed with hlongFormat
+    //read one node line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+    sscanf(buf, "%*d" dfloatFormat dfloatFormat dfloatFormat,
+           VX.ptr()+n, VY.ptr()+n, VZ.ptr()+n);
+  }
+
+  /* look for section with Element node data */
+  do{
+    //skip ahead line by line to the "$Elements" header
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+  }while(!strstr(buf, "$Elements"));
+
+  /* read number of element records in mesh (all element types together) */
+  hlong gNelements;
+  //the line after the header holds the record count
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
+  sscanf(buf, hlongFormat, &gNelements);
+
+  /* first pass: count quadrilaterals and boundary faces, remembering where the records start */
+  fpos_t fpos;
+  fgetpos(fp, &fpos);
+  hlong Nquadrilaterals = 0;
+  hlong gNboundaryFaces = 0;
+
+  for(hlong n=0;n<gNelements;++n){
+    int ElementType;
+    //read one element record
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+    sscanf(buf, "%*d%d", &ElementType); // second integer on the line is the element type
+    if(ElementType==1) ++gNboundaryFaces;
+    if(ElementType==3) ++Nquadrilaterals;
+  }
+  // rewind to start of elements
+  fsetpos(fp, &fpos);
+
+  hlong chunk = (hlong) Nquadrilaterals/size; // even share per rank
+  int remainder = (int) (Nquadrilaterals - chunk*size);
+
+  hlong NquadrilateralsLocal = chunk + (rank<remainder); // the first 'remainder' ranks take one extra
+
+  /* where do these elements start ? */
+  hlong start = rank*chunk + std::min(rank, remainder);
+  hlong end = start + NquadrilateralsLocal-1; // inclusive upper bound
+
+  /* allocate space for Element node index data */
+  EToV.malloc(NquadrilateralsLocal*Nverts);
+  elementInfo.malloc(NquadrilateralsLocal);
+
+  /* second pass: store every boundary face, and the quadrilaterals in [start,end] */
+  hlong cnt=0, bcnt=0;
+  Nquadrilaterals = 0; // reused as the running index of quadrilaterals seen in this pass
+
+  boundaryInfo.malloc(gNboundaryFaces*3); // 3 entries per face: tag, v1, v2
+  for(hlong n=0;n<gNelements;++n){
+    int ElementType;
+    hlong v1, v2, v3, v4;
+    //read one element record
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+    sscanf(buf, "%*d%d", &ElementType);
+
+    if(ElementType==1){ // boundary face (not partitioned: every rank keeps all of them)
+      sscanf(buf, "%*d%*d %*d" hlongFormat "%*d" hlongFormat hlongFormat,
+             boundaryInfo.ptr()+bcnt*3, &v1, &v2);
+      boundaryInfo[bcnt*3+1] = v1-1; // file vertex ids are shifted down by one
+      boundaryInfo[bcnt*3+2] = v2-1;
+      ++bcnt;
+    }
+
+    if(ElementType==3){  // quadrilateral
+      if(start<=Nquadrilaterals && Nquadrilaterals<=end){
+        sscanf(buf, "%*d%*d%*d " hlongFormat " %*d" hlongFormat hlongFormat hlongFormat hlongFormat,
+               elementInfo.ptr()+cnt, &v1, &v2, &v3, &v4);
+
+#if 0
+        // check orientation
+        dfloat xe1 = VX[v1-1], xe2 = VX[v2-1], xe4 = VX[v4-1];
+        dfloat ye1 = VY[v1-1], ye2 = VY[v2-1], ye4 = VY[v4-1];
+        dfloat J = 0.25*((xe2-xe1)*(ye4-ye1) - (xe4-xe1)*(ye2-ye1));
+        if(J<0){
+          int v4tmp = v4;
+          v4 = v2;
+          v2 = v4tmp;
+          printf("unwarping element\n");
+        }
+#endif
+
+        /* store the four vertex ids of the quadrilateral, shifted to 0-based */
+        EToV[cnt*Nverts+0] = v1-1;
+        EToV[cnt*Nverts+1] = v2-1;
+        EToV[cnt*Nverts+2] = v3-1;
+        EToV[cnt*Nverts+3] = v4-1;
+        ++cnt;
+      }
+      ++Nquadrilaterals;
+    }
+  }
+  fclose(fp);
+
+  /* record number of boundary faces found */
+  NboundaryFaces = bcnt;
+
+  /* record number of found quadrilaterals */
+  Nelements = (dlong) NquadrilateralsLocal;
+
+  /* collect vertices for each element */
+  EX.malloc(Nverts*Nelements);
+  EY.malloc(Nverts*Nelements);
+  EZ.malloc(Nverts*Nelements);
+  for(dlong e=0;e<Nelements;++e){ // dlong index, matching the other Gmsh readers
+    for(int n=0;n<Nverts;++n){
+      EX[e*Nverts+n] = VX[EToV[e*Nverts+n]];
+      EY[e*Nverts+n] = VY[EToV[e*Nverts+n]];
+      EZ[e*Nverts+n] = VZ[EToV[e*Nverts+n]];
+#if 0
+      printf("e %d v %d %g %g %g\n",
+             e, n,
+             EX[e*Nverts+n],
+             EY[e*Nverts+n],
+             EZ[e*Nverts+n]);
+#endif
+    }
+  }
+}
+
+} //namespace libp
diff --git a/libs/mesh/meshParallelReaderTet3D.cpp b/libs/mesh/meshReadGmshTet3D.cpp
similarity index 57%
rename from libs/mesh/meshParallelReaderTet3D.cpp
rename to libs/mesh/meshReadGmshTet3D.cpp
index 7075009f4..165151b77 100644
--- a/libs/mesh/meshParallelReaderTet3D.cpp
+++ b/libs/mesh/meshReadGmshTet3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,82 +25,57 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 /*
    purpose: read gmsh tetrahedra mesh
 */
-void meshTet3D::ParallelReader(const char *fileName){
-
-  FILE *fp = fopen(fileName, "r");
-
-  dim = 3;
-  Nverts = 4; // number of vertices per element
-  Nfaces = 4;
-
-  // vertices on each face
-  int faceVertices_[4][3] = {{0,1,2},{0,1,3},{1,2,3},{2,0,3}};
+void mesh_t::ReadGmshTet3D(const std::string fileName){
 
-  NfaceVertices = 3;
-  faceVertices =
-    (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
-  memcpy(faceVertices, faceVertices_[0], 12*sizeof(int));
-
-  if(fp==NULL){
-    stringstream ss;
-    ss << "Cannot open file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  FILE *fp = fopen(fileName.c_str(), "r");
+  LIBP_ABORT("Cannot open file: " << fileName,
+             fp==NULL);
 
   char buf[BUFSIZ];
   do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
   }while(!strstr(buf, "$Nodes"));
 
   /* read number of nodes in mesh */
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  //read to end of line
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
   sscanf(buf, hlongFormat, &(Nnodes));
 
   /* allocate space for node coordinates */
-  dfloat *VX = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-  dfloat *VY = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-  dfloat *VZ = (dfloat*) calloc(Nnodes, sizeof(dfloat));
+  memory<dfloat> VX(Nnodes);
+  memory<dfloat> VY(Nnodes);
+  memory<dfloat> VZ(Nnodes);
 
   /* load nodes */
   for(hlong n=0;n<Nnodes;++n){
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
     sscanf(buf, "%*d" dfloatFormat dfloatFormat dfloatFormat,
-           VX+n, VY+n, VZ+n);
+           VX.ptr()+n, VY.ptr()+n, VZ.ptr()+n);
   }
 
   /* look for section with Element node data */
   do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
   }while(!strstr(buf, "$Elements"));
 
   /* read number of nodes in mesh */
   hlong gNelements;
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  //read to end of line
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
   sscanf(buf, hlongFormat, &gNelements);
 
   /* find # of tets */
@@ -109,11 +84,9 @@ void meshTet3D::ParallelReader(const char *fileName){
   hlong Ntets = 0, gNboundaryFaces = 0;
   for(hlong n=0;n<gNelements;++n){
     int ElementType;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
     sscanf(buf, "%*d%d", &ElementType);
     if(ElementType==4) ++Ntets; // tet code is 4
     if(ElementType==2) ++gNboundaryFaces;
@@ -127,33 +100,28 @@ void meshTet3D::ParallelReader(const char *fileName){
   hlong NtetsLocal = chunk + (rank<remainder);
 
   /* where do these elements start ? */
-  hlong start = rank*chunk + mymin(rank, remainder);
+  hlong start = rank*chunk + std::min(rank, remainder);
   hlong end = start + NtetsLocal-1;
 
   /* allocate space for Element node index data */
-
-  EToV
-    = (hlong*) calloc(NtetsLocal*Nverts, sizeof(hlong));
-  elementInfo
-    = (hlong*) calloc(NtetsLocal,sizeof(hlong));
+  EToV.malloc(NtetsLocal*Nverts);
+  elementInfo.malloc(NtetsLocal);
 
   /* scan through file looking for tetrahedra elements */
   hlong cnt=0, bcnt = 0;
   Ntets = 0;
 
-  boundaryInfo = (hlong*) calloc(gNboundaryFaces*4, sizeof(hlong));
+  boundaryInfo.malloc(gNboundaryFaces*4);
   for(hlong n=0;n<gNelements;++n){
     int ElementType;
     hlong v1, v2, v3, v4;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
     sscanf(buf, "%*d%d", &ElementType);
     if(ElementType==2){ // boundary face
       sscanf(buf, "%*d%*d %*d" hlongFormat "%*d" hlongFormat hlongFormat hlongFormat,
-             boundaryInfo+bcnt*4, &v1, &v2, &v3);
+             boundaryInfo.ptr()+bcnt*4, &v1, &v2, &v3);
       boundaryInfo[bcnt*4+1] = v1-1;
       boundaryInfo[bcnt*4+2] = v2-1;
       boundaryInfo[bcnt*4+3] = v3-1;
@@ -165,7 +133,7 @@ void meshTet3D::ParallelReader(const char *fileName){
         sscanf(buf,
                "%*d%*d%*d " hlongFormat " %*d"
                hlongFormat hlongFormat hlongFormat hlongFormat,
-               elementInfo+cnt,&v1, &v2, &v3, &v4);
+               elementInfo.ptr()+cnt,&v1, &v2, &v3, &v4);
         /* read vertex triplet for trianngle */
         EToV[cnt*Nverts+0] = v1-1;
         EToV[cnt*Nverts+1] = v2-1;
@@ -185,9 +153,9 @@ void meshTet3D::ParallelReader(const char *fileName){
   Nelements = (dlong) NtetsLocal;
 
   /* collect vertices for each element */
-  EX = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  EZ = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
+  EX.malloc(Nverts*Nelements);
+  EY.malloc(Nverts*Nelements);
+  EZ.malloc(Nverts*Nelements);
   for(dlong e=0;e<Nelements;++e){
     for(int n=0;n<Nverts;++n){
       hlong vid = EToV[e*Nverts+n];
@@ -196,11 +164,6 @@ void meshTet3D::ParallelReader(const char *fileName){
       EZ[e*Nverts+n] = VZ[vid];
     }
   }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
-  free(VZ);
-
 }
 
+} //namespace libp
diff --git a/libs/mesh/meshParallelReaderTri2D.cpp b/libs/mesh/meshReadGmshTri2D.cpp
similarity index 60%
rename from libs/mesh/meshParallelReaderTri2D.cpp
rename to libs/mesh/meshReadGmshTri2D.cpp
index 3d6040c6f..bb78b9066 100644
--- a/libs/mesh/meshParallelReaderTri2D.cpp
+++ b/libs/mesh/meshReadGmshTri2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,84 +25,57 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
+
+namespace libp {
 
 /*
    purpose: read gmsh triangle mesh
 */
-void meshTri2D::ParallelReader(const char *fileName){
-
-  FILE *fp = fopen(fileName, "r");
-
-  dim = 2;
-  Nverts = 3; // number of vertices per element
-  Nfaces = 3;
-  NfaceVertices = 2;
-
-  /* vertices on each face */
-  int faceVertices_[4][2] = {{0,1},{1,2},{2,0}};
-
-  faceVertices =
-    (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
+void mesh_t::ReadGmshTri2D(const std::string fileName){
 
-  memcpy(faceVertices, faceVertices_[0], NfaceVertices*Nfaces*sizeof(int));
-
-  if(fp==NULL){
-    stringstream ss;
-    ss << "Cannot open file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  FILE *fp = fopen(fileName.c_str(), "r");
+  LIBP_ABORT("Cannot open file: " << fileName,
+             fp==NULL);
 
   char buf[BUFSIZ];
 
-
   // look for Nodes section
   do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
   }while(!strstr(buf, "$Nodes"));
 
   /* read number of nodes in mesh */
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  //read to end of line
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
   sscanf(buf, hlongFormat, &(Nnodes));
 
   /* allocate space for node coordinates */
-  dfloat *VX = (dfloat*) calloc(Nnodes, sizeof(dfloat));
-  dfloat *VY = (dfloat*) calloc(Nnodes, sizeof(dfloat));
+  memory<dfloat> VX(Nnodes);
+  memory<dfloat> VY(Nnodes);
 
   /* load nodes */
   for(hlong n=0;n<Nnodes;++n){
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
-    sscanf(buf, "%*d" dfloatFormat dfloatFormat, VX+n, VY+n);
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+    sscanf(buf, "%*d" dfloatFormat dfloatFormat, VX.ptr()+n, VY.ptr()+n);
   }
 
   /* look for section with Element node data */
   do{
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
   }while(!strstr(buf, "$Elements"));
 
   /* read number of elements in mesh */
   hlong gNelements;
-  if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-    stringstream ss;
-    ss << "Error reading mesh file: " << fileName;
-    LIBP_ABORT(ss.str())
-  }
+  //read to end of line
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
   sscanf(buf, hlongFormat, &gNelements);
 
   /* find # of triangles */
@@ -112,11 +85,9 @@ void meshTri2D::ParallelReader(const char *fileName){
   hlong gNboundaryFaces = 0;
   for(hlong n=0;n<gNelements;++n){
     int ElementType;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
     sscanf(buf, "%*d%d", &ElementType);
     if(ElementType==1) ++gNboundaryFaces;
     if(ElementType==2) ++Ntriangles;
@@ -130,34 +101,28 @@ void meshTri2D::ParallelReader(const char *fileName){
   hlong NtrianglesLocal = chunk + (rank<remainder);
 
   /* where do these elements start ? */
-  hlong start = rank*chunk + mymin(rank, remainder);
+  hlong start = rank*chunk + std::min(rank, remainder);
   hlong end   = start + NtrianglesLocal-1;
 
   /* allocate space for Element node index data */
-
-  EToV
-    = (hlong*) calloc(NtrianglesLocal*Nverts,
-                     sizeof(hlong));
-  elementInfo
-    = (hlong*) calloc(NtrianglesLocal,sizeof(hlong));
+  EToV.malloc(NtrianglesLocal*Nverts);
+  elementInfo.malloc(NtrianglesLocal);
 
   /* scan through file looking for triangle elements */
   hlong cnt=0, bcnt=0;
   Ntriangles = 0;
 
-  boundaryInfo = (hlong*) calloc(gNboundaryFaces*3, sizeof(hlong));
+  boundaryInfo.malloc(gNboundaryFaces*3);
   for(hlong n=0;n<gNelements;++n){
     int ElementType;
     hlong v1, v2, v3;
-    if (!fgets(buf, BUFSIZ, fp)) { //read to end of line
-      stringstream ss;
-      ss << "Error reading mesh file: " << fileName;
-      LIBP_ABORT(ss.str())
-    }
+    //read to end of line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
     sscanf(buf, "%*d%d", &ElementType);
     if(ElementType==1){ // boundary face
       sscanf(buf, "%*d%*d %*d" hlongFormat "%*d" hlongFormat hlongFormat,
-             boundaryInfo+bcnt*3, &v1, &v2);
+             boundaryInfo.ptr()+bcnt*3, &v1, &v2);
       boundaryInfo[bcnt*3+1] = v1-1;
       boundaryInfo[bcnt*3+2] = v2-1;
       ++bcnt;
@@ -165,7 +130,7 @@ void meshTri2D::ParallelReader(const char *fileName){
     if(ElementType==2){  // triangle
       if(start<=Ntriangles && Ntriangles<=end){
         sscanf(buf, "%*d%*d%*d " hlongFormat " %*d" hlongFormat hlongFormat hlongFormat,
-               elementInfo+cnt, &v1, &v2, &v3);
+               elementInfo.ptr()+cnt, &v1, &v2, &v3);
 
         // check orientation
         dfloat xe1 = VX[v1-1], xe2 = VX[v2-1], xe3 = VX[v3-1];
@@ -197,17 +162,14 @@ void meshTri2D::ParallelReader(const char *fileName){
   Nelements = (dlong) NtrianglesLocal;
 
   /* collect vertices for each element */
-  EX = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nverts*Nelements, sizeof(dfloat));
+  EX.malloc(Nverts*Nelements);
+  EY.malloc(Nverts*Nelements);
   for(dlong e=0;e<Nelements;++e){
     for(int n=0;n<Nverts;++n){
       EX[e*Nverts+n] = VX[EToV[e*Nverts+n]];
       EY[e*Nverts+n] = VY[EToV[e*Nverts+n]];
     }
   }
-
-  /* release VX and VY (these are too big to keep) */
-  free(VX);
-  free(VY);
 }
 
+} //namespace libp
diff --git a/libs/mesh/meshReadGmshTri3D.cpp b/libs/mesh/meshReadGmshTri3D.cpp
new file mode 100644
index 000000000..00bdb8507
--- /dev/null
+++ b/libs/mesh/meshReadGmshTri3D.cpp
@@ -0,0 +1,184 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "mesh.hpp"
+
+namespace libp {
+
+/* purpose: read gmsh triangle mesh whose vertices carry (x,y,z) coordinates.
+   Fills Nnodes, EToV, elementInfo, boundaryInfo, NboundaryFaces, Nelements and EX/EY/EZ.
+   LIBP_ABORT(msg, cond) guards the open and every line read (it fires when cond holds). */
+void mesh_t::ReadGmshTri3D(const std::string fileName){
+
+  FILE *fp = fopen(fileName.c_str(), "r");
+  LIBP_ABORT("Cannot open file: " << fileName,
+             fp==NULL);
+
+  char buf[BUFSIZ];
+
+  // look for Nodes section
+  do{
+    //skip ahead line by line to the "$Nodes" header; abort if the file ends first
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+  }while(!strstr(buf, "$Nodes"));
+
+  /* read number of nodes in mesh */
+  //the line after the header holds the node count
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
+  sscanf(buf, hlongFormat, &(Nnodes));
+
+  /* function-local scratch for all node coordinates (only used to fill EX/EY/EZ below) */
+  memory<dfloat> VX(Nnodes);
+  memory<dfloat> VY(Nnodes);
+  memory<dfloat> VZ(Nnodes);
+
+  /* load nodes: skip the leading node id (%*d), keep x, y, z */
+  for(hlong n=0;n<Nnodes;++n){ // hlong index: Nnodes is parsed with hlongFormat
+    //read one node line
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+    sscanf(buf, "%*d" dfloatFormat dfloatFormat dfloatFormat,
+           VX.ptr()+n, VY.ptr()+n, VZ.ptr()+n);
+  }
+
+  /* look for section with Element node data */
+  do{
+    //skip ahead line by line to the "$Elements" header
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+  }while(!strstr(buf, "$Elements"));
+
+  /* read number of element records in mesh (all element types together) */
+  hlong gNelements;
+  //the line after the header holds the record count
+  LIBP_ABORT("Error reading mesh file: " << fileName,
+             !fgets(buf, BUFSIZ, fp));
+  sscanf(buf, hlongFormat, &gNelements);
+
+  /* first pass: count triangles and boundary faces, remembering where the records start */
+  fpos_t fpos;
+  fgetpos(fp, &fpos);
+  hlong Ntriangles = 0;
+  hlong gNboundaryFaces = 0;
+  for(hlong n=0;n<gNelements;++n){
+    int ElementType;
+    //read one element record
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+    sscanf(buf, "%*d%d", &ElementType); // second integer on the line is the element type
+    if(ElementType==1) ++gNboundaryFaces;
+    if(ElementType==2) ++Ntriangles;
+  }
+  // rewind to start of elements
+  fsetpos(fp, &fpos);
+
+  hlong chunk = (hlong) Ntriangles/size; // even share per rank
+  int remainder = (int) (Ntriangles - chunk*size);
+
+  hlong NtrianglesLocal = chunk + (rank<remainder); // the first 'remainder' ranks take one extra
+
+  /* where do these elements start ? */
+  hlong start = rank*chunk + std::min(rank, remainder);
+  hlong end   = start + NtrianglesLocal-1; // inclusive upper bound
+
+  /* allocate space for Element node index data */
+  EToV.malloc(NtrianglesLocal*Nverts);
+  elementInfo.malloc(NtrianglesLocal);
+
+  /* second pass: store every boundary face, and the triangles in [start,end] */
+  hlong cnt=0, bcnt=0;
+  Ntriangles = 0; // reused as the running index of triangles seen in this pass
+
+  boundaryInfo.malloc(gNboundaryFaces*3); // 3 entries per face: tag, v1, v2
+  for(hlong n=0;n<gNelements;++n){
+    int ElementType;
+    hlong v1, v2, v3;
+    //read one element record
+    LIBP_ABORT("Error reading mesh file: " << fileName,
+               !fgets(buf, BUFSIZ, fp));
+    sscanf(buf, "%*d%d", &ElementType);
+    if(ElementType==1){ // boundary face (not partitioned: every rank keeps all of them)
+      sscanf(buf, "%*d%*d %*d" hlongFormat "%*d" hlongFormat hlongFormat,
+             boundaryInfo.ptr()+bcnt*3, &v1, &v2);
+      boundaryInfo[bcnt*3+1] = v1-1; // file vertex ids are shifted down by one
+      boundaryInfo[bcnt*3+2] = v2-1;
+      ++bcnt;
+    }
+    if(ElementType==2){  // triangle
+      if(start<=Ntriangles && Ntriangles<=end){
+        sscanf(buf, "%*d%*d%*d " hlongFormat " %*d" hlongFormat hlongFormat hlongFormat,
+               elementInfo.ptr()+cnt, &v1, &v2, &v3);
+
+        // orientation check is disabled: the coordinate reads are commented out and the flip sits under #if 0
+        // dfloat xe1 = VX[v1-1], xe2 = VX[v2-1], xe3 = VX[v3-1];
+        // dfloat ye1 = VY[v1-1], ye2 = VY[v2-1], ye3 = VY[v3-1];
+        // dfloat ze1 = VZ[v1-1], ze2 = VZ[v2-1], ze3 = VZ[v3-1];
+
+#if 0
+        // TW: no idea
+        dfloat J = 0.25*((xe2-xe1)*(ye3-ye1) - (xe3-xe1)*(ye2-ye1));
+        if(J<0){
+          int v3tmp = v3;
+          v3 = v2;
+          v2 = v3tmp;
+          //      printf("unwarping element\n");
+        }
+#endif
+
+        /* store the three vertex ids of the triangle, shifted to 0-based */
+        EToV[cnt*Nverts+0] = v1-1;
+        EToV[cnt*Nverts+1] = v2-1;
+        EToV[cnt*Nverts+2] = v3-1;
+
+        ++cnt;
+      }
+      ++Ntriangles;
+    }
+  }
+  fclose(fp);
+
+  /* record number of boundary faces found */
+  NboundaryFaces = bcnt;
+
+  /* record number of found triangles */
+  Nelements = (dlong) NtrianglesLocal;
+
+  /* collect vertices for each element */
+  EX.malloc(Nverts*Nelements);
+  EY.malloc(Nverts*Nelements);
+  EZ.malloc(Nverts*Nelements);
+  for(dlong e=0;e<Nelements;++e){ // dlong index, matching the other Gmsh readers
+    for(int n=0;n<Nverts;++n){
+      EX[e*Nverts+n] = VX[EToV[e*Nverts+n]];
+      EY[e*Nverts+n] = VY[EToV[e*Nverts+n]];
+      EZ[e*Nverts+n] = VZ[EToV[e*Nverts+n]];
+    }
+  }
+}
+
+} //namespace libp
diff --git a/libs/mesh/meshReferenceNodesHex3D.cpp b/libs/mesh/meshReferenceNodesHex3D.cpp
index cad7e6fcb..a2e596a65 100644
--- a/libs/mesh/meshReferenceNodesHex3D.cpp
+++ b/libs/mesh/meshReferenceNodesHex3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,65 +25,52 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshHex3D::ReferenceNodes(int N_){
+namespace libp {
+
+void mesh_t::ReferenceNodesHex3D(){ // builds the degree-N hex reference data; N is read here, no longer passed in
 
-  N = N_;
   Nq = N+1;
   Nfp = Nq*Nq;
   Np = Nq*Nq*Nq;
 
   /* Nodal Data */
-  r = (dfloat *) malloc(Np*sizeof(dfloat));
-  s = (dfloat *) malloc(Np*sizeof(dfloat));
-  t = (dfloat *) malloc(Np*sizeof(dfloat));
   NodesHex3D(N, r, s, t);
-
-  faceNodes = (int *) malloc(Nfaces*Nfp*sizeof(int));
   FaceNodesHex3D(N, r, s, t, faceNodes);
-
-  vertexNodes = (int*) calloc(Nverts, sizeof(int));
   VertexNodesHex3D(N, r, s, t, vertexNodes);
 
   //GLL quadrature
-  dfloat *gllz = (dfloat *) malloc((N+1)*sizeof(dfloat));
-  w = (dfloat *) malloc((N+1)*sizeof(dfloat));
-  JacobiGLL(N, gllz, w);
+  JacobiGLL(N, gllz, gllw); // NOTE(review): gllz/gllw are not declared in this function - presumably mesh_t members; confirm
 
   //Lumped Mass matrix
-  MM    = (dfloat *) malloc(Np*Np*sizeof(dfloat));
-  invMM = (dfloat *) malloc(Np*Np*sizeof(dfloat));
-  LumpedMassMatrixHex3D(N, w, MM);
-  invLumpedMassMatrixHex3D(N, w, invMM);
+  LumpedMassMatrixHex3D(N, gllw, MM);
+  invLumpedMassMatrixHex3D(N, gllw, invMM);
 
   // D matrix
-  D = (dfloat *) malloc(Nq*Nq*sizeof(dfloat));
-  Dmatrix1D(N, Nq, gllz, Nq, gllz, D);
+  Dmatrix1D(N, gllz, gllz, D);
+  o_D = platform.malloc<dfloat>(D); // NOTE(review): presumably a device-side copy of D - confirm against platform.malloc
 
   /* Plotting data */
-  plotN = N_ + 3; //enriched interpolation space for plotting
+  plotN = N + 3; //enriched interpolation space for plotting
   plotNq = plotN + 1;
   plotNp = plotNq*plotNq*plotNq;
 
   /* Plotting nodes */
-  plotR = (dfloat *) malloc(plotNp*sizeof(dfloat));
-  plotS = (dfloat *) malloc(plotNp*sizeof(dfloat));
-  plotT = (dfloat *) malloc(plotNp*sizeof(dfloat));
   EquispacedNodesHex3D(plotN, plotR, plotS, plotT);
 
   plotNelements = 6*plotN*plotN*plotN;
   plotNverts = 4;
-  plotEToV = (int*) malloc(plotNelements*plotNverts*sizeof(int));
   EquispacedEToVHex3D(plotN, plotEToV);
 
-  dfloat *plot1D = (dfloat *) malloc(plotNq*sizeof(dfloat));
+  memory<dfloat> plot1D; // NOTE(review): passed on unsized; presumably EquispacedNodes1D allocates it - confirm
   EquispacedNodes1D(plotN, plot1D);
+  InterpolationMatrix1D(N, gllz, plot1D, plotInterp);
 
-  plotInterp = (dfloat *) malloc(Nq*plotNq*sizeof(dfloat));
-  InterpolationMatrix1D(N, Nq, gllz, plotNq, plot1D, plotInterp);
-
-  free(gllz);
-  free(plot1D);
+  props["defines/" "p_N"]= N; // adjacent string literals concatenate, so the key is "defines/p_N"
+  props["defines/" "p_Nq"]= Nq;
+  props["defines/" "p_Np"]= Np;
+  props["defines/" "p_Nfp"]= Nfp;
+  props["defines/" "p_NfacesNfp"]= Nfp*Nfaces;
 }
 
+} //namespace libp
diff --git a/libs/mesh/meshReferenceNodesQuad2D.cpp b/libs/mesh/meshReferenceNodesQuad2D.cpp
index cb56703b9..46a0ae1db 100644
--- a/libs/mesh/meshReferenceNodesQuad2D.cpp
+++ b/libs/mesh/meshReferenceNodesQuad2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,70 +25,52 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshQuad3D::ReferenceNodes(int N_){
-  mesh_t *mesh_p = (mesh_t*) this;
-  meshQuad2D* quadmesh = (meshQuad2D*) mesh_p;
-  quadmesh->meshQuad2D::ReferenceNodes(N_);
-}
+namespace libp {
 
-void meshQuad2D::ReferenceNodes(int N_){
+void mesh_t::ReferenceNodesQuad2D(){ // builds the degree-N quad reference data; N is read here, no longer passed in
 
-  N = N_;
-  Nfp = N+1;
   Nq = (N+1);
+  Nfp = N+1;
   Np = (N+1)*(N+1);
 
   /* Nodal Data */
-  r = (dfloat *) malloc(Np*sizeof(dfloat));
-  s = (dfloat *) malloc(Np*sizeof(dfloat));
   NodesQuad2D(N, r, s);
-
-  faceNodes = (int *) malloc(Nfaces*Nfp*sizeof(int));
   FaceNodesQuad2D(N, r, s, faceNodes);
-
-  vertexNodes = (int*) calloc(Nverts, sizeof(int));
   VertexNodesQuad2D(N, r, s, vertexNodes);
 
   //GLL quadrature
-  dfloat *gllz = (dfloat *) malloc((N+1)*sizeof(dfloat));
-  w = (dfloat *) malloc((N+1)*sizeof(dfloat));
-  JacobiGLL(N, gllz, w);
+  JacobiGLL(N, gllz, gllw); // NOTE(review): gllz/gllw are not declared in this function - presumably mesh_t members; confirm
 
   //Lumped Mass matrix
-  MM    = (dfloat *) malloc(Np*Np*sizeof(dfloat));
-  invMM = (dfloat *) malloc(Np*Np*sizeof(dfloat));
-  LumpedMassMatrixQuad2D(N, w, MM);
-  invLumpedMassMatrixQuad2D(N, w, invMM);
+  LumpedMassMatrixQuad2D(N, gllw, MM);
+  invLumpedMassMatrixQuad2D(N, gllw, invMM);
 
   // D matrix
-  D = (dfloat *) malloc(Nq*Nq*sizeof(dfloat));
-  Dmatrix1D(N, Nq, gllz, Nq, gllz, D);
+  Dmatrix1D(N, gllz, gllz, D);
+  o_D = platform.malloc<dfloat>(D); // NOTE(review): presumably a device-side copy of D - confirm against platform.malloc
 
   /* Plotting data */
-  plotN = N_ + 3; //enriched interpolation space for plotting
+  plotN = N + 3; //enriched interpolation space for plotting
   plotNq = plotN + 1;
   plotNp = plotNq*plotNq;
 
   /* Plotting nodes */
-  plotR = (dfloat *) malloc(plotNp*sizeof(dfloat));
-  plotS = (dfloat *) malloc(plotNp*sizeof(dfloat));
   EquispacedNodesQuad2D(plotN, plotR, plotS);
 
   plotNelements = 2*plotN*plotN;
   plotNverts = 3;
-  plotEToV = (int*) malloc(plotNelements*plotNverts*sizeof(int));
   EquispacedEToVQuad2D(plotN, plotEToV);
 
-  dfloat *plot1D = (dfloat *) malloc(plotNq*sizeof(dfloat));
+  memory<dfloat> plot1D; // NOTE(review): passed on unsized; presumably EquispacedNodes1D allocates it - confirm
   EquispacedNodes1D(plotN, plot1D);
+  InterpolationMatrix1D(N, gllz, plot1D, plotInterp);
 
-  plotInterp = (dfloat *) malloc(Nq*plotNq*sizeof(dfloat));
-  InterpolationMatrix1D(N, Nq, gllz, plotNq, plot1D, plotInterp);
-
-  free(gllz);
-  free(plot1D);
+  props["defines/" "p_N"]= N; // adjacent string literals concatenate, so the key is "defines/p_N"
+  props["defines/" "p_Nq"]= Nq;
+  props["defines/" "p_Np"]= Np;
+  props["defines/" "p_Nfp"]= Nfp;
+  props["defines/" "p_NfacesNfp"]= Nfp*Nfaces;
 }
 
+}//namespace libp
diff --git a/libs/mesh/meshReferenceNodesTet3D.cpp b/libs/mesh/meshReferenceNodesTet3D.cpp
index e85321969..8c8372c9c 100644
--- a/libs/mesh/meshReferenceNodesTet3D.cpp
+++ b/libs/mesh/meshReferenceNodesTet3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,76 +25,95 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTet3D::ReferenceNodes(int N_){
+namespace libp {
+
+void mesh_t::ReferenceNodesTet3D(){
 
-  N = N_;
   Nfp = ((N+1)*(N+2))/2;
   Np = ((N+1)*(N+2)*(N+3))/6;
 
   /* Nodal Data */
-  r = (dfloat *) malloc(Np*sizeof(dfloat));
-  s = (dfloat *) malloc(Np*sizeof(dfloat));
-  t = (dfloat *) malloc(Np*sizeof(dfloat));
   NodesTet3D(N, r, s, t);
-
-  faceNodes = (int *) malloc(Nfaces*Nfp*sizeof(int));
   FaceNodesTet3D(N, r, s, t, faceNodes);
-
-  vertexNodes = (int*) calloc(Nverts, sizeof(int));
   VertexNodesTet3D(N, r, s, t, vertexNodes);
 
-  dfloat *V = (dfloat *) malloc(Np*Np*sizeof(dfloat));
-  VandermondeTet3D(N, Np, r, s, t, V);
+  memory<dfloat> V;
+  VandermondeTet3D(N, r, s, t, V);
 
   //Mass matrix
-  MM = (dfloat *) malloc(Np*Np*sizeof(dfloat));
-  invMM = (dfloat *) malloc(Np*Np*sizeof(dfloat));
   MassMatrixTet3D(Np, V, MM);
   invMassMatrixTet3D(Np, V, invMM);
-  free(V);
+  o_MM = platform.malloc<dfloat>(MM); //MM is symmetric
 
   //packed D matrices
-  D  = (dfloat *) malloc(3*Np*Np*sizeof(dfloat));
+  DmatrixTet3D(N, r, s, t, D);
   Dr = D + 0*Np*Np;
   Ds = D + 1*Np*Np;
   Dt = D + 2*Np*Np;
-  DmatrixTet3D(N, Np, r, s, t, Dr, Ds, Dt);
 
-  LIFT = (dfloat *) malloc(Np*Nfaces*Nfp*sizeof(dfloat));
-  LIFTmatrixTet3D(N, faceNodes, r, s, t, LIFT);
+  memory<dfloat> DT(3*Np*Np);
+  memory<dfloat> DrT = DT + 0*Np*Np;
+  memory<dfloat> DsT = DT + 1*Np*Np;
+  memory<dfloat> DtT = DT + 2*Np*Np;
+  linAlg_t::matrixTranspose(Np, Np, Dr, Np, DrT, Np);
+  linAlg_t::matrixTranspose(Np, Np, Ds, Np, DsT, Np);
+  linAlg_t::matrixTranspose(Np, Np, Dt, Np, DtT, Np);
+  o_D = platform.malloc<dfloat>(DT);
 
-  sM = (dfloat *) calloc(Np*Nfaces*Nfp,sizeof(dfloat));
+  LIFTmatrixTet3D(N, faceNodes, r, s, t, LIFT);
   SurfaceMassMatrixTet3D(N, MM, LIFT, sM);
 
+  memory<dfloat> LIFTT(Np*Nfaces*Nfp);
+  linAlg_t::matrixTranspose(Np, Nfp*Nfaces, LIFT, Nfp*Nfaces, LIFTT, Np);
+
+  memory<dfloat> sMT(Np*Nfaces*Nfp);
+  linAlg_t::matrixTranspose(Np, Nfp*Nfaces, sM, Nfp*Nfaces, sMT, Np);
+
+  o_sM = platform.malloc<dfloat>(sMT);
+  o_LIFT = platform.malloc<dfloat>(LIFTT);
+
   //packed stiffness matrices
-  S = (dfloat*) calloc(6*Np*Np, sizeof(dfloat));
+  SmatrixTet3D(N, Dr, Ds, Dt, MM, S);
   Srr = S + 0*Np*Np;
   Srs = S + 1*Np*Np;
   Srt = S + 2*Np*Np;
   Sss = S + 3*Np*Np;
   Sst = S + 4*Np*Np;
   Stt = S + 5*Np*Np;
-  SmatrixTet3D(N, Dr, Ds, Dt, MM, Srr, Srs, Srt, Sss, Sst, Stt);
+
+  memory<dfloat> ST(6*Np*Np);
+  memory<dfloat> SrrT = ST + 0*Np*Np;
+  memory<dfloat> SrsT = ST + 1*Np*Np;
+  memory<dfloat> SrtT = ST + 2*Np*Np;
+  memory<dfloat> SssT = ST + 3*Np*Np;
+  memory<dfloat> SstT = ST + 4*Np*Np;
+  memory<dfloat> SttT = ST + 5*Np*Np;
+  linAlg_t::matrixTranspose(Np, Np, Srr, Np, SrrT, Np);
+  linAlg_t::matrixTranspose(Np, Np, Srs, Np, SrsT, Np);
+  linAlg_t::matrixTranspose(Np, Np, Srt, Np, SrtT, Np);
+  linAlg_t::matrixTranspose(Np, Np, Sss, Np, SssT, Np);
+  linAlg_t::matrixTranspose(Np, Np, Sst, Np, SstT, Np);
+  linAlg_t::matrixTranspose(Np, Np, Stt, Np, SttT, Np);
+
+  o_S = platform.malloc<dfloat>(ST);
 
   /* Plotting data */
   plotN = N + 3; //enriched interpolation space for plotting
   plotNp = (plotN+1)*(plotN+2)*(plotN+3)/6;
 
   /* Plotting nodes */
-  plotR = (dfloat *) malloc(plotNp*sizeof(dfloat));
-  plotS = (dfloat *) malloc(plotNp*sizeof(dfloat));
-  plotT = (dfloat *) malloc(plotNp*sizeof(dfloat));
   EquispacedNodesTet3D(plotN, plotR, plotS, plotT);
 
   plotNelements = plotN*plotN*plotN;
   plotNverts = 4;
-  plotEToV = (int*) malloc(plotNelements*plotNverts*sizeof(int));
   EquispacedEToVTet3D(plotN, plotEToV);
+  InterpolationMatrixTet3D(N, r, s, t, plotR, plotS, plotT, plotInterp);
 
-  plotInterp = (dfloat *) malloc(Np*plotNp*sizeof(dfloat));
-  InterpolationMatrixTet3D(N, Np, r, s, t, plotNp, plotR, plotS, plotT, plotInterp);
+  props["defines/" "p_N"]= N;
+  props["defines/" "p_Np"]= Np;
+  props["defines/" "p_Nfp"]= Nfp;
+  props["defines/" "p_NfacesNfp"]= Nfp*Nfaces;
 }
 
-
+} //namespace libp
diff --git a/libs/mesh/meshReferenceNodesTri2D.cpp b/libs/mesh/meshReferenceNodesTri2D.cpp
index b8e6d5770..26b673137 100644
--- a/libs/mesh/meshReferenceNodesTri2D.cpp
+++ b/libs/mesh/meshReferenceNodesTri2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,75 +25,83 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTri3D::ReferenceNodes(int N_){
-  mesh_t *mesh_p = (mesh_t*) this;
-  meshTri2D* trimesh = (meshTri2D*) mesh_p;
-  trimesh->meshTri2D::ReferenceNodes(N);
-}
+namespace libp {
 
-void meshTri2D::ReferenceNodes(int N_){
+void mesh_t::ReferenceNodesTri2D(){
 
-  N = N_;
   Nfp = N+1;
   Np = (N+1)*(N+2)/2;
 
   /* Nodal Data */
-  r = (dfloat *) malloc(Np*sizeof(dfloat));
-  s = (dfloat *) malloc(Np*sizeof(dfloat));
   NodesTri2D(N, r, s);
-
-  faceNodes = (int *) malloc(Nfaces*Nfp*sizeof(int));
   FaceNodesTri2D(N, r, s, faceNodes);
-
-  vertexNodes = (int*) calloc(Nverts, sizeof(int));
   VertexNodesTri2D(N, r, s, vertexNodes);
 
-  dfloat *V = (dfloat *) malloc(Np*Np*sizeof(dfloat));
-  VandermondeTri2D(N, Np, r, s, V);
+  memory<dfloat> V;
+  VandermondeTri2D(N, r, s, V);
 
   //Mass matrix
-  MM    = (dfloat *) malloc(Np*Np*sizeof(dfloat));
-  invMM = (dfloat *) malloc(Np*Np*sizeof(dfloat));
   MassMatrixTri2D(Np, V, MM);
   invMassMatrixTri2D(Np, V, invMM);
-  free(V);
+  o_MM = platform.malloc<dfloat>(MM); //MM is symmetric
 
   //packed D matrices
-  D  = (dfloat *) malloc(2*Np*Np*sizeof(dfloat));
+  DmatrixTri2D(N, r, s, D);
   Dr = D + 0*Np*Np;
   Ds = D + 1*Np*Np;
-  DmatrixTri2D(N, Np, r, s, Dr, Ds);
 
-  LIFT = (dfloat *) malloc(Np*Nfaces*Nfp*sizeof(dfloat));
-  LIFTmatrixTri2D(N, faceNodes, r, s, LIFT);
+  memory<dfloat> DT(2*Np*Np);
+  memory<dfloat> DrT = DT + 0*Np*Np;
+  memory<dfloat> DsT = DT + 1*Np*Np;
+  linAlg_t::matrixTranspose(Np, Np, Dr, Np, DrT, Np);
+  linAlg_t::matrixTranspose(Np, Np, Ds, Np, DsT, Np);
+  o_D = platform.malloc<dfloat>(DT);
 
-  sM = (dfloat *) calloc(Np*Nfaces*Nfp,sizeof(dfloat));
+  LIFTmatrixTri2D(N, faceNodes, r, s, LIFT);
   SurfaceMassMatrixTri2D(N, MM, LIFT, sM);
 
+  memory<dfloat> LIFTT(Np*Nfaces*Nfp);
+  linAlg_t::matrixTranspose(Np, Nfp*Nfaces, LIFT, Nfp*Nfaces, LIFTT, Np);
+
+  memory<dfloat> sMT(Np*Nfaces*Nfp);
+  linAlg_t::matrixTranspose(Np, Nfp*Nfaces, sM, Nfp*Nfaces, sMT, Np);
+
+  o_sM = platform.malloc<dfloat>(sMT);
+  o_LIFT = platform.malloc<dfloat>(LIFTT);
+
   //packed stiffness matrices
-  S = (dfloat*) calloc(3*Np*Np, sizeof(dfloat));
+  SmatrixTri2D(N, Dr, Ds, MM, S);
   Srr = S + 0*Np*Np;
   Srs = S + 1*Np*Np;
   Sss = S + 2*Np*Np;
-  SmatrixTri2D(N, Dr, Ds, MM, Srr, Srs, Sss);
+
+  memory<dfloat> ST(3*Np*Np);
+  memory<dfloat> SrrT = ST + 0*Np*Np;
+  memory<dfloat> SrsT = ST + 1*Np*Np;
+  memory<dfloat> SssT = ST + 2*Np*Np;
+  linAlg_t::matrixTranspose(Np, Np, Srr, Np, SrrT, Np);
+  linAlg_t::matrixTranspose(Np, Np, Srs, Np, SrsT, Np);
+  linAlg_t::matrixTranspose(Np, Np, Sss, Np, SssT, Np);
+
+  o_S = platform.malloc<dfloat>(ST);
 
   /* Plotting data */
   plotN = N + 3; //enriched interpolation space for plotting
   plotNp = (plotN+1)*(plotN+2)/2;
 
   /* Plotting nodes */
-  plotR = (dfloat *) malloc(plotNp*sizeof(dfloat));
-  plotS = (dfloat *) malloc(plotNp*sizeof(dfloat));
   EquispacedNodesTri2D(plotN, plotR, plotS);
 
   plotNelements = plotN*plotN;
   plotNverts = 3;
-  plotEToV = (int*) malloc(plotNelements*plotNverts*sizeof(int));
   EquispacedEToVTri2D(plotN, plotEToV);
+  InterpolationMatrixTri2D(N, r, s, plotR, plotS, plotInterp);
 
-  plotInterp = (dfloat *) malloc(Np*plotNp*sizeof(dfloat));
-  InterpolationMatrixTri2D(N, Np, r, s, plotNp, plotR, plotS, plotInterp);
+  props["defines/" "p_N"]= N;
+  props["defines/" "p_Np"]= Np;
+  props["defines/" "p_Nfp"]= Nfp;
+  props["defines/" "p_NfacesNfp"]= Nfp*Nfaces;
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSetElementType.cpp b/libs/mesh/meshSetElementType.cpp
new file mode 100644
index 000000000..861f7130f
--- /dev/null
+++ b/libs/mesh/meshSetElementType.cpp
@@ -0,0 +1,90 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "mesh.hpp"
+
+namespace libp {
+
+void mesh_t::SetElementType(const Mesh::ElementType eType) {
+  // Store eType in elementType and set Nverts, Nfaces, NfaceVertices and faceVertices to match it.
+  if (eType==Mesh::TRIANGLES) {
+    elementType = Mesh::TRIANGLES;
+
+    Nverts = 3;        // number of vertices per element
+    Nfaces = 3;        // number of faces per element
+    NfaceVertices = 2; // number of vertices per face
+
+    // vertices on each face: Nfaces rows of NfaceVertices vertex indices in [0,Nverts)
+    int _faceVertices[3][2] = {{0,1},{1,2},{2,0}};
+
+    faceVertices.malloc(NfaceVertices*Nfaces);
+    faceVertices.copyFrom(_faceVertices[0]); // first row's address == the table as a flat int array
+
+  } else if (eType==Mesh::QUADRILATERALS) {
+    elementType = Mesh::QUADRILATERALS;
+
+    Nverts = 4;        // number of vertices per element
+    Nfaces = 4;        // number of faces per element
+    NfaceVertices = 2; // number of vertices per face
+
+    // vertices on each face: Nfaces rows of NfaceVertices vertex indices in [0,Nverts)
+    int _faceVertices[4][2] = {{0,1},{1,2},{2,3},{3,0}};
+
+    faceVertices.malloc(NfaceVertices*Nfaces);
+    faceVertices.copyFrom(_faceVertices[0]);
+
+  } else if (eType==Mesh::TETRAHEDRA) {
+    elementType = Mesh::TETRAHEDRA;
+
+    Nverts = 4;        // number of vertices per element
+    Nfaces = 4;        // number of faces per element
+    NfaceVertices = 3; // number of vertices per face
+
+    // vertices on each face: Nfaces rows of NfaceVertices vertex indices in [0,Nverts)
+    int _faceVertices[4][3] = {{0,1,2},{0,3,1},{1,3,2},{0,2,3}};
+
+    faceVertices.malloc(NfaceVertices*Nfaces);
+    faceVertices.copyFrom(_faceVertices[0]);
+
+  } else if (eType==Mesh::HEXAHEDRA) {
+    elementType = Mesh::HEXAHEDRA;
+
+    Nverts = 8;        // number of vertices per element
+    Nfaces = 6;        // number of faces per element
+    NfaceVertices = 4; // number of vertices per face
+
+    // vertices on each face: Nfaces rows of NfaceVertices vertex indices in [0,Nverts)
+    int _faceVertices[6][4] =
+      {{0,1,2,3},{0,4,5,1},{1,5,6,2},{2,6,7,3},{0,3,7,4},{4,7,6,5}};
+
+    faceVertices.malloc(NfaceVertices*Nfaces);
+    faceVertices.copyFrom(_faceVertices[0]);
+  } else {
+    LIBP_FORCE_ABORT("Unknown element type: " << eType);
+  }
+}
+
+} //namespace libp
diff --git a/libs/mesh/meshSettings.cpp b/libs/mesh/meshSettings.cpp
index 70f4a61d9..c8e5fa396 100644
--- a/libs/mesh/meshSettings.cpp
+++ b/libs/mesh/meshSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,8 +25,11 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
+#include "parAdogs.hpp"
 
-meshSettings_t::meshSettings_t(MPI_Comm& _comm):
+namespace libp {
+
+meshSettings_t::meshSettings_t(comm_t _comm):
   settings_t(_comm) {
 
   newSetting("MESH FILE",
@@ -83,14 +86,13 @@ meshSettings_t::meshSettings_t(MPI_Comm& _comm):
              "4",
              "Degree of polynomial finite element space",
              {"1","2","3","4","5","6","7","8","9","10","11","12","13","14","15"});
+
+  paradogs::AddSettings(*this);
 }
 
 void meshSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) {
     std::cout << "Mesh Settings:\n\n";
     if (!compareSetting("MESH FILE","BOX"))
       reportSetting("MESH FILE");
@@ -127,5 +129,11 @@ void meshSettings_t::report() {
     }
 
     reportSetting("POLYNOMIAL DEGREE");
+
+    if (!compareSetting("MESH FILE","BOX")) {
+      paradogs::ReportSettings(*this);
+    }
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSetup.cpp b/libs/mesh/meshSetup.cpp
index bb01929c1..479c75b47 100644
--- a/libs/mesh/meshSetup.cpp
+++ b/libs/mesh/meshSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,94 +25,80 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
 
-mesh_t& mesh_t::Setup(platform_t& platform, meshSettings_t& settings,
-                      MPI_Comm comm){
+namespace libp {
 
-  string fileName;
-  int N, dim, elementType;
+void mesh_t::Setup(platform_t& _platform, meshSettings_t& _settings,
+                   comm_t _comm){
 
-  settings.getSetting("MESH FILE", fileName);
-  settings.getSetting("POLYNOMIAL DEGREE", N);
-  settings.getSetting("ELEMENT TYPE", elementType);
+  platform = _platform;
+  settings = _settings;
+  props = platform.props();
+
+  comm = _comm.Dup();
+  rank = comm.rank();
+  size = comm.size();
+
+  int eType=0;
+  settings.getSetting("ELEMENT TYPE", eType);
   settings.getSetting("MESH DIMENSION", dim);
 
-  mesh_t *mesh=NULL;
-  switch(elementType){
-  case TRIANGLES:
-    if(dim==2)
-      mesh = new meshTri2D(platform, settings, comm);
-    else
-      mesh = new meshTri3D(platform, settings, comm);
-    break;
-  case QUADRILATERALS:
-    if(dim==2)
-      mesh = new meshQuad2D(platform, settings, comm);
-    else
-      mesh = new meshQuad3D(platform, settings, comm);
-    break;
-  case TETRAHEDRA:
-    mesh = new meshTet3D(platform, settings, comm);
-    break;
-  case HEXAHEDRA:
-    mesh = new meshHex3D(platform, settings, comm);
-    break;
-  }
 
-  mesh->elementType = elementType;
+  SetElementType(Mesh::ElementType(eType));
 
-  mesh->ringHalo = NULL;
+  props["defines/" "p_dim"]= dim;
+  props["defines/" "p_Nfaces"]= Nfaces;
+
+  std::string fileName;
+  settings.getSetting("MESH FILE", fileName);
 
   if (settings.compareSetting("MESH FILE","PMLBOX")) {
     //build a box mesh with a pml layer
-    mesh->SetupPmlBox();
+    SetupPmlBox();
   } else if (settings.compareSetting("MESH FILE","BOX")) {
     //build a box mesh
-    mesh->SetupBox();
+    SetupBox();
   } else {
     // read chunk of elements from file
-    mesh->ParallelReader(fileName.c_str());
+    ReadGmsh(fileName);
 
-    // partition elements using Morton ordering & parallel sort
-    mesh->GeometricPartition();
+    // partition elements using parAdogs
+    Partition();
   }
 
-  // connect elements using parallel sort
-  mesh->ParallelConnect();
+  // load reference (r,s) element nodes
+  settings.getSetting("POLYNOMIAL DEGREE", N);
+  ReferenceNodes();
 
-  // print out connectivity statistics
-  mesh->PrintPartitionStatistics();
+  // connect elements
+  Connect();
 
   // connect elements to boundary faces
-  mesh->ConnectBoundary();
-
-  // load reference (r,s) element nodes
-  mesh->ReferenceNodes(N);
+  ConnectBoundary();
 
   // set up halo exchange info for MPI (do before connect face nodes)
-  mesh->HaloSetup();
+  HaloSetup();
 
-  // compute physical (x,y) locations of the element nodes
-  mesh->PhysicalNodes();
+  // connect face vertices
+  ConnectFaceVertices();
 
-  // compute geometric factors
-  mesh->GeometricFactors();
+  // connect face nodes
+  ConnectFaceNodes();
 
-  // connect face nodes (find trace indices)
-  mesh->ConnectFaceNodes();
-
-  // compute surface geofacs
-  mesh->SurfaceGeometricFactors();
+  // make global indexing
+  ConnectNodes();
 
-  // make a global indexing
-  mesh->ParallelConnectNodes();
+  // compute physical (x,y) locations of the element nodes
+  PhysicalNodes();
 
-  // make an ogs operator and label local/global gather elements
-  mesh->ParallelGatherScatterSetup();
+  // compute geometric factors
+  GeometricFactors();
 
-  mesh->OccaSetup();
+  // compute surface geofacs
+  SurfaceGeometricFactors();
 
-  return *mesh;
+  // label local/global gather elements
+  GatherScatterSetup();
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSetupBoxHex3D.cpp b/libs/mesh/meshSetupBoxHex3D.cpp
index 946c4c90b..eefe551c1 100644
--- a/libs/mesh/meshSetupBoxHex3D.cpp
+++ b/libs/mesh/meshSetupBoxHex3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,34 +25,21 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshHex3D::SetupBox(){
+namespace libp {
 
-  dim = 3;
-  Nverts = 8; // number of vertices per element
-  Nfaces = 6;
-  NfaceVertices = 4;
-
-  // vertices on each face
-  int _faceVertices[6][4] =
-    {{0,1,2,3},{0,1,5,4},{1,2,6,5},{2,3,7,6},{3,0,4,7},{4,5,6,7}};
-
-  faceVertices =
-    (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
-
-  memcpy(faceVertices, _faceVertices[0], NfaceVertices*Nfaces*sizeof(int));
+void mesh_t::SetupBoxHex3D(){
 
   // find a factorization size = size_x*size_y*size_z such that
   //  size_x>=size_y>=size_z are all 'close' to one another
   int size_x, size_y, size_z;
-  factor3(size, size_x, size_y, size_z);
+  Factor3(size, size_x, size_y, size_z);
 
-  //find our coordinates in the MPI grid such that
-  // rank = rank_x + rank_y*size_x + rank_z*size_x*size_y
-  int rank_z = rank/(size_x*size_y);
-  int rank_y = (rank-rank_z*size_x*size_y)/size_x;
-  int rank_x = rank % size_x;
+  //determine (x,y,z) rank coordinates for this process
+  int rank_x=-1, rank_y=-1, rank_z=-1;
+  RankDecomp3(size_x, size_y, size_z,
+              rank_x, rank_y, rank_z,
+              rank);
 
   //get global size from settings
   dlong NX, NY, NZ;
@@ -97,9 +84,9 @@ void meshHex3D::SetupBox(){
   dfloat dy = DIMY/NY;
   dfloat dz = DIMZ/NZ;
 
-  dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x));
-  dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y));
-  dlong offset_z = rank_z*(NZ/size_z) + mymin(rank_z, (NZ % size_z));
+  dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x));
+  dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y));
+  dlong offset_z = rank_z*(NZ/size_z) + std::min(rank_z, (NZ % size_z));
 
   //bottom corner of physical domain
   dfloat X0 = -DIMX/2.0 + offset_x*dx;
@@ -115,18 +102,20 @@ void meshHex3D::SetupBox(){
   Nnodes = NnX*NnY*NnZ; //global node count
   Nelements = nx*ny*nz; //local
 
-  EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong));
-  EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EZ = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
+  EToV.malloc(Nelements*Nverts);
+  EX.malloc(Nelements*Nverts);
+  EY.malloc(Nelements*Nverts);
+  EZ.malloc(Nelements*Nverts);
 
-  elementInfo = (hlong*) calloc(Nelements, sizeof(hlong));
+  elementInfo.malloc(Nelements);
 
-  dlong e = 0;
+  #pragma omp parallel for collapse(3)
   for(int k=0;k<nz;++k){
     for(int j=0;j<ny;++j){
       for(int i=0;i<nx;++i){
 
+        const dlong e = i + j*nx + k*nx*ny;
+
         const hlong i0 = i+offset_x;
         const hlong i1 = (i+1+offset_x)%NnX;
         const hlong j0 = j+offset_y;
@@ -148,9 +137,9 @@ void meshHex3D::SetupBox(){
         dfloat y0 = Y0 + dy*j;
         dfloat z0 = Z0 + dz*k;
 
-        dfloat *ex = EX+e*Nverts;
-        dfloat *ey = EY+e*Nverts;
-        dfloat *ez = EZ+e*Nverts;
+        dfloat *ex = EX.ptr()+e*Nverts;
+        dfloat *ey = EY.ptr()+e*Nverts;
+        dfloat *ez = EZ.ptr()+e*Nverts;
 
         ex[0] = x0;    ey[0] = y0;    ez[0] = z0;
         ex[1] = x0+dx; ey[1] = y0;    ez[1] = z0;
@@ -163,16 +152,13 @@ void meshHex3D::SetupBox(){
         ex[7] = x0;    ey[7] = y0+dy; ez[7] = z0+dz;
 
         elementInfo[e] = 1; // domain
-        e++;
       }
     }
   }
 
-
-
   if (boundaryFlag != -1) { //-1 reserved for periodic case
     NboundaryFaces = 2*NX*NY + 2*NX*NZ + 2*NY*NZ;
-    boundaryInfo = (hlong*) calloc(NboundaryFaces*(NfaceVertices+1), sizeof(hlong));
+    boundaryInfo.malloc(NboundaryFaces*(NfaceVertices+1));
 
     hlong bcnt = 0;
 
@@ -241,7 +227,8 @@ void meshHex3D::SetupBox(){
     }
 
   } else {
-    NboundaryFaces = 0;
-    boundaryInfo = NULL; // no boundaries
+    NboundaryFaces = 0; // no boundaries
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSetupBoxQuad2D.cpp b/libs/mesh/meshSetupBoxQuad2D.cpp
index bad483446..e9db23102 100644
--- a/libs/mesh/meshSetupBoxQuad2D.cpp
+++ b/libs/mesh/meshSetupBoxQuad2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,35 +25,21 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshQuad3D::SetupBox(){
-  LIBP_ABORT(string("BOX mesh not currently supprted for Quad3D meshes."))
-}
-
-void meshQuad2D::SetupBox(){
-
-  dim = 2;
-  Nverts = 4; // number of vertices per element
-  Nfaces = 4;
-  NfaceVertices = 2;
+namespace libp {
 
-  // vertices on each face
-  int faceVertices_[4][2] = {{0,1},{1,2},{2,3},{3,0}};
-
-  faceVertices = (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
-  memcpy(faceVertices, faceVertices_[0], NfaceVertices*Nfaces*sizeof(int));
+void mesh_t::SetupBoxQuad2D(){
 
   // find a factorization size = size_x*size_y such that
   //  size_x>=size_y and are 'close' to one another
   int size_x, size_y;
-  factor2(size, size_x, size_y);
+  Factor2(size, size_x, size_y);
 
-  //find our coordinates in the MPI grid such that
-  // rank = rank_x + rank_y*size_x
-  int rank_y = rank / size_x;
-  int rank_x = rank % size_x;
+  //determine (x,y) rank coordinates for this process
+  int rank_x=-1, rank_y=-1;
+  RankDecomp2(size_x, size_y,
+              rank_x, rank_y,
+              rank);
 
   //get global size from settings
   dlong NX, NY;
@@ -91,8 +77,8 @@ void meshQuad2D::SetupBox(){
   dfloat dx = DIMX/NX;
   dfloat dy = DIMY/NY;
 
-  dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x));
-  dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y));
+  dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x));
+  dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y));
 
   //bottom corner of physical domain
   dfloat X0 = -DIMX/2.0 + offset_x*dx;
@@ -106,16 +92,18 @@ void meshQuad2D::SetupBox(){
   Nnodes = NnX*NnY; //global node count
   Nelements = nx*ny; //local
 
-  EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong));
-  EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
+  EToV.malloc(Nelements*Nverts);
+  EX.malloc(Nelements*Nverts);
+  EY.malloc(Nelements*Nverts);
 
-  elementInfo = (hlong*) calloc(Nelements, sizeof(hlong));
+  elementInfo.malloc(Nelements);
 
-  dlong e = 0;
+  #pragma omp parallel for collapse(2)
   for(int j=0;j<ny;++j){
     for(int i=0;i<nx;++i){
 
+      const dlong e = i + j*nx;
+
       const hlong i0 = i+offset_x;
       const hlong i1 = (i+1+offset_x)%NnX;
       const hlong j0 = j+offset_y;
@@ -129,8 +117,8 @@ void meshQuad2D::SetupBox(){
       dfloat x0 = X0 + dx*i;
       dfloat y0 = Y0 + dy*j;
 
-      dfloat *ex = EX+e*Nverts;
-      dfloat *ey = EY+e*Nverts;
+      dfloat *ex = EX.ptr()+e*Nverts;
+      dfloat *ey = EY.ptr()+e*Nverts;
 
       ex[0] = x0;    ey[0] = y0;
       ex[1] = x0+dx; ey[1] = y0;
@@ -138,14 +126,13 @@ void meshQuad2D::SetupBox(){
       ex[3] = x0;    ey[3] = y0+dy;
 
       elementInfo[e] = 1; // domain
-      e++;
     }
   }
 
 
   if (boundaryFlag != -1) { //-1 reserved for periodic case
     NboundaryFaces = 2*NX + 2*NY;
-    boundaryInfo = (hlong*) calloc(NboundaryFaces*(NfaceVertices+1), sizeof(hlong));
+    boundaryInfo.malloc(NboundaryFaces*(NfaceVertices+1));
 
     hlong bcnt = 0;
 
@@ -182,7 +169,8 @@ void meshQuad2D::SetupBox(){
     }
 
   } else {
-    NboundaryFaces = 0;
-    boundaryInfo = NULL; // no boundaries
+    NboundaryFaces = 0; // no boundaries
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSetupBoxTet3D.cpp b/libs/mesh/meshSetupBoxTet3D.cpp
index 8fc201702..95fddf3b1 100644
--- a/libs/mesh/meshSetupBoxTet3D.cpp
+++ b/libs/mesh/meshSetupBoxTet3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,31 +25,21 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTet3D::SetupBox(){
+namespace libp {
 
-  dim = 3;
-  Nverts = 4; // number of vertices per element
-  Nfaces = 4;
-  NfaceVertices = 3;
-
-  // vertices on each face
-  int faceVertices_[4][3] = {{0,1,2},{0,1,3},{1,2,3},{2,0,3}};
-
-  faceVertices = (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
-  memcpy(faceVertices, faceVertices_[0], 12*sizeof(int));
+void mesh_t::SetupBoxTet3D(){
 
   // find a factorization size = size_x*size_y*size_z such that
   //  size_x>=size_y>=size_z are all 'close' to one another
   int size_x, size_y, size_z;
-  factor3(size, size_x, size_y, size_z);
+  Factor3(size, size_x, size_y, size_z);
 
-  //find our coordinates in the MPI grid such that
-  // rank = rank_x + rank_y*size_x + rank_z*size_x*size_y
-  int rank_z = rank/(size_x*size_y);
-  int rank_y = (rank-rank_z*size_x*size_y)/size_x;
-  int rank_x = rank % size_x;
+  //determine (x,y,z) rank coordinates for this process
+  int rank_x=-1, rank_y=-1, rank_z=-1;
+  RankDecomp3(size_x, size_y, size_z,
+              rank_x, rank_y, rank_z,
+              rank);
 
   //get global size from settings
   dlong NX, NY, NZ;
@@ -94,9 +84,9 @@ void meshTet3D::SetupBox(){
   dfloat dy = DIMY/NY;
   dfloat dz = DIMZ/NZ;
 
-  dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x));
-  dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y));
-  dlong offset_z = rank_z*(NZ/size_z) + mymin(rank_z, (NZ % size_z));
+  dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x));
+  dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y));
+  dlong offset_z = rank_z*(NZ/size_z) + std::min(rank_z, (NZ % size_z));
 
   //bottom corner of physical domain
   dfloat X0 = -DIMX/2.0 + offset_x*dx;
@@ -112,18 +102,20 @@ void meshTet3D::SetupBox(){
   Nnodes = NnX*NnY*NnZ; //global node count
   Nelements = 6*nx*ny*nz; //local element count (each cube divided into 6 tets)
 
-  EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong));
-  EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EZ = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
+  EToV.malloc(Nelements*Nverts);
+  EX.malloc(Nelements*Nverts);
+  EY.malloc(Nelements*Nverts);
+  EZ.malloc(Nelements*Nverts);
 
-  elementInfo = (hlong*) calloc(Nelements, sizeof(hlong));
+  elementInfo.malloc(Nelements);
 
-  dlong e = 0;
+  #pragma omp parallel for collapse(3)
   for(int k=0;k<nz;++k){
     for(int j=0;j<ny;++j){
       for(int i=0;i<nx;++i){
 
+        dlong e = 6*(i + j*nx + k*nx*ny);
+
         const hlong i0 = i+offset_x;
         const hlong i1 = (i+1+offset_x)%NnX;
         const hlong j0 = j+offset_y;
@@ -226,7 +218,7 @@ void meshTet3D::SetupBox(){
 
   if (boundaryFlag != -1) { //-1 reserved for periodic case
     NboundaryFaces = 4*NX*NY + 4*NX*NZ + 4*NY*NZ;
-    boundaryInfo = (hlong*) calloc(NboundaryFaces*(NfaceVertices+1), sizeof(hlong));
+    boundaryInfo.malloc(NboundaryFaces*(NfaceVertices+1));
 
     hlong bcnt = 0;
 
@@ -319,7 +311,8 @@ void meshTet3D::SetupBox(){
     }
 
   } else {
-    NboundaryFaces = 0;
-    boundaryInfo = NULL; // no boundaries
+    NboundaryFaces = 0; // no boundaries
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSetupBoxTri2D.cpp b/libs/mesh/meshSetupBoxTri2D.cpp
index cfec4b370..0cb26178c 100644
--- a/libs/mesh/meshSetupBoxTri2D.cpp
+++ b/libs/mesh/meshSetupBoxTri2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,35 +25,21 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTri3D::SetupBox(){
-  LIBP_ABORT(string("BOX mesh not currently supprted for Tri3D meshes."))
-}
-
-void meshTri2D::SetupBox(){
-
-  dim = 2;
-  Nverts = 3; // number of vertices per element
-  Nfaces = 3;
-  NfaceVertices = 2;
+namespace libp {
 
-  // vertices on each face
-  int faceVertices_[4][2] = {{0,1},{1,2},{2,0}};
-
-  faceVertices = (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
-  memcpy(faceVertices, faceVertices_[0], NfaceVertices*Nfaces*sizeof(int));
+void mesh_t::SetupBoxTri2D(){
 
   // find a factorization size = size_x*size_y such that
   //  size_x>=size_y and are 'close' to one another
   int size_x, size_y;
-  factor2(size, size_x, size_y);
+  Factor2(size, size_x, size_y);
 
-  //find our coordinates in the MPI grid such that
-  // rank = rank_x + rank_y*size_x
-  int rank_y = rank / size_x;
-  int rank_x = rank % size_x;
+  //determine (x,y) rank coordinates for this process
+  int rank_x=-1, rank_y=-1;
+  RankDecomp2(size_x, size_y,
+              rank_x, rank_y,
+              rank);
 
   //get global size from settings
   dlong NX, NY;
@@ -91,8 +77,8 @@ void meshTri2D::SetupBox(){
   dfloat dx = DIMX/NX;
   dfloat dy = DIMY/NY;
 
-  dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x));
-  dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y));
+  dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x));
+  dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y));
 
   //bottom corner of physical domain
   dfloat X0 = -DIMX/2.0 + offset_x*dx;
@@ -106,16 +92,18 @@ void meshTri2D::SetupBox(){
   Nnodes = NnX*NnY; //global node count
   Nelements = 2*nx*ny; //local
 
-  EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong));
-  EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
+  EToV.malloc(Nelements*Nverts);
+  EX.malloc(Nelements*Nverts);
+  EY.malloc(Nelements*Nverts);
 
-  elementInfo = (hlong*) calloc(Nelements, sizeof(hlong));
+  elementInfo.malloc(Nelements);
 
-  dlong e = 0;
+  #pragma omp parallel for collapse(2)
   for(int j=0;j<ny;++j){
     for(int i=0;i<nx;++i){
 
+      dlong e = 2*(i + j*nx);
+
       const hlong i0 = i+offset_x;
       const hlong i1 = (i+1+offset_x)%NnX;
       const hlong j0 = j+offset_y;
@@ -151,7 +139,7 @@ void meshTri2D::SetupBox(){
 
   if (boundaryFlag != -1) { //-1 reserved for periodic case
     NboundaryFaces = 2*NX + 2*NY;
-    boundaryInfo = (hlong*) calloc(NboundaryFaces*(NfaceVertices+1), sizeof(hlong));
+    boundaryInfo.malloc(NboundaryFaces*(NfaceVertices+1));
 
     hlong bcnt = 0;
 
@@ -188,7 +176,8 @@ void meshTri2D::SetupBox(){
     }
 
   } else {
-    NboundaryFaces = 0;
-    boundaryInfo = NULL; // no boundaries
+    NboundaryFaces = 0; // no boundaries
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSetupNewDegree.cpp b/libs/mesh/meshSetupNewDegree.cpp
index c99174150..fb7d70394 100644
--- a/libs/mesh/meshSetupNewDegree.cpp
+++ b/libs/mesh/meshSetupNewDegree.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,106 +25,42 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 //build a new mesh object from another with a different degree.
-mesh_t& mesh_t::SetupNewDegree(int Nf){
+mesh_t mesh_t::SetupNewDegree(int Nf){
+
+  // Copy the existing object
+  mesh_t mesh=*this;
 
   //just reuse the current mesh if the degree isnt changing.
-  if (Nf==N) return *this;
-
-  mesh_t *mesh=NULL;
-  switch(elementType){
-  case TRIANGLES:
-    if(dim==2)
-      mesh = new meshTri2D(platform, settings, comm);
-    else
-      mesh = new meshTri3D(platform, settings, comm);
-    break;
-  case QUADRILATERALS:
-    if(dim==2)
-      mesh = new meshQuad2D(platform, settings, comm);
-    else
-      mesh = new meshQuad3D(platform, settings, comm);
-    break;
-  case TETRAHEDRA:
-    mesh = new meshTet3D(platform, settings, comm);
-    break;
-  case HEXAHEDRA:
-    mesh = new meshHex3D(platform, settings, comm);
-    break;
-  }
-
-  //shallow copy of base mesh geometry
-  mesh->dim           = dim;
-  mesh->Nverts        = Nverts;
-  mesh->Nfaces        = Nfaces;
-  mesh->NfaceVertices = NfaceVertices;
-  mesh->faceVertices  = faceVertices;
-
-  mesh->elementType = elementType;
-
-  mesh->Nnodes = Nnodes;
-  mesh->EX = EX; // coordinates of vertices for each element
-  mesh->EY = EY;
-  mesh->EZ = EZ;
-
-  mesh->Nelements = Nelements;
-  mesh->NelementsGlobal = NelementsGlobal;
-  mesh->EToV = EToV; // element-to-vertex connectivity
-  mesh->EToE = EToE; // element-to-element connectivity
-  mesh->EToF = EToF; // element-to-(local)face connectivity
-  mesh->EToP = EToP; // element-to-partition/process connectivity
-  mesh->EToB = EToB; // element-to-boundary condition type
-
-  mesh->elementInfo = elementInfo;
-
-  mesh->NboundaryFaces = NboundaryFaces;
-  mesh->boundaryInfo = boundaryInfo;
-
-  mesh->halo = halo;
-  mesh->NinternalElements = NinternalElements;
-  mesh->NhaloElements = NhaloElements;
-  mesh->totalHaloPairs = totalHaloPairs;
-  mesh->internalElementIds = internalElementIds;
-  mesh->haloElementIds = haloElementIds;
-  mesh->o_internalElementIds = o_internalElementIds;
-  mesh->o_haloElementIds     = o_haloElementIds;
-
-  mesh->ogs = ogs;
-  mesh->globalIds = globalIds;
-
-  mesh->NglobalGatherElements = NglobalGatherElements;
-  mesh->globalGatherElementList = globalGatherElementList;
-  mesh->o_globalGatherElementList = o_globalGatherElementList;
-
-  mesh->NlocalGatherElements = NlocalGatherElements;
-  mesh->localGatherElementList = localGatherElementList;
-  mesh->o_localGatherElementList = o_localGatherElementList;
+  if (Nf==N) return mesh;
+
+  mesh.N = Nf;
 
   // load reference (r,s) element nodes
-  mesh->ReferenceNodes(Nf);
+  mesh.ReferenceNodes();
+
+  // connect face nodes (find trace indices)
+  mesh.ConnectFaceNodes();
+
+  // make a global indexing
+  mesh.ConnectNodes();
 
   // compute physical (x,y) locations of the element nodes
-  mesh->PhysicalNodes();
+  mesh.PhysicalNodes();
 
   // compute geometric factors
-  mesh->GeometricFactors();
-
-  // connect face nodes (find trace indices)
-  mesh->ConnectFaceNodes();
+  mesh.GeometricFactors();
 
   // compute surface geofacs
-  mesh->SurfaceGeometricFactors();
-
-  // make a global indexing
-  mesh->ParallelConnectNodes();
-
-  // make an ogs operator and label local/global gather elements
-  mesh->ParallelGatherScatterSetup();
+  mesh.SurfaceGeometricFactors();
 
-  mesh->OccaSetup();
+  // label local/global gather elements
+  mesh.GatherScatterSetup();
 
-  return *mesh;
+  return mesh;
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSetupPmlBoxHex3D.cpp b/libs/mesh/meshSetupPmlBoxHex3D.cpp
index 1b9149b02..348f5e580 100644
--- a/libs/mesh/meshSetupPmlBoxHex3D.cpp
+++ b/libs/mesh/meshSetupPmlBoxHex3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,39 +25,26 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 static void addHex(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong NnZ,
                     dfloat x0, dfloat y0, dfloat z0, dfloat dx, dfloat dy, dfloat dz,
-                    hlong *EToV, dfloat *EX, dfloat *EY, dfloat *EZ,
-                    hlong *elementInfo, int type, dlong &e);
-
-void meshHex3D::SetupPmlBox(){
-
-  dim = 3;
-  Nverts = 8; // number of vertices per element
-  Nfaces = 6;
-  NfaceVertices = 4;
-
-  // vertices on each face
-  int _faceVertices[6][4] =
-    {{0,1,2,3},{0,1,5,4},{1,2,6,5},{2,3,7,6},{3,0,4,7},{4,5,6,7}};
+                    memory<hlong> EToV, memory<dfloat> EX, memory<dfloat> EY, memory<dfloat> EZ,
+                    memory<hlong> elementInfo, int type, dlong &e);
 
-  faceVertices =
-    (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
-
-  memcpy(faceVertices, _faceVertices[0], NfaceVertices*Nfaces*sizeof(int));
+void mesh_t::SetupPmlBoxHex3D(){
 
   // find a factorization size = size_x*size_y*size_z such that
   //  size_x>=size_y>=size_z are all 'close' to one another
   int size_x, size_y, size_z;
-  factor3(size, size_x, size_y, size_z);
+  Factor3(size, size_x, size_y, size_z);
 
-  //find our coordinates in the MPI grid such that
-  // rank = rank_x + rank_y*size_x + rank_z*size_x*size_y
-  int rank_z = rank/(size_x*size_y);
-  int rank_y = (rank-rank_z*size_x*size_y)/size_x;
-  int rank_x = rank % size_x;
+  //determine (x,y,z) rank coordinates for this process
+  int rank_x=-1, rank_y=-1, rank_z=-1;
+  RankDecomp3(size_x, size_y, size_z,
+              rank_x, rank_y, rank_z,
+              rank);
 
   //get global size from settings
   dlong NX, NY, NZ;
@@ -90,8 +77,8 @@ void meshHex3D::SetupPmlBox(){
   settings.getSetting("BOX BOUNDARY FLAG", boundaryFlag);
 
   const int periodicFlag = (boundaryFlag == -1) ? 1 : 0;
-  if (periodicFlag)
-    LIBP_ABORT(string("Periodic boundary unsupported for PMLBOX mesh."))
+  LIBP_ABORT("Periodic boundary unsupported for PMLBOX mesh.",
+             periodicFlag);
 
   //local grid physical sizes
   dfloat DIMX, DIMY, DIMZ;
@@ -107,9 +94,9 @@ void meshHex3D::SetupPmlBox(){
   dfloat dy = DIMY/NY;
   dfloat dz = DIMZ/NZ;
 
-  dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x));
-  dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y));
-  dlong offset_z = rank_z*(NZ/size_z) + mymin(rank_z, (NZ % size_z));
+  dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x));
+  dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y));
+  dlong offset_z = rank_z*(NZ/size_z) + std::min(rank_z, (NZ % size_z));
 
   //local grid physical sizes
   dfloat dimx = nx*dx;
@@ -183,12 +170,12 @@ void meshHex3D::SetupPmlBox(){
   if (rank_x==size_x-1 && rank_y==size_y-1 && rank_z==size_z-1) Nelements+=pmlNx*pmlNy*pmlNz;
 
 
-  EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong));
-  EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EZ = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
+  EToV.malloc(Nelements*Nverts);
+  EX.malloc(Nelements*Nverts);
+  EY.malloc(Nelements*Nverts);
+  EZ.malloc(Nelements*Nverts);
 
-  elementInfo = (hlong*) calloc(Nelements, sizeof(hlong));
+  elementInfo.malloc(Nelements);
 
   dlong e = 0;
   for(int k=0;k<nz;++k){
@@ -851,7 +838,7 @@ void meshHex3D::SetupPmlBox(){
 
   if (boundaryFlag != -1) { //-1 reserved for periodic case
     NboundaryFaces = 2*NX*NY + 2*NX*NZ + 2*NY*NZ;
-    boundaryInfo = (hlong*) calloc(NboundaryFaces*(NfaceVertices+1), sizeof(hlong));
+    boundaryInfo.malloc(NboundaryFaces*(NfaceVertices+1));
 
     hlong bcnt = 0;
 
@@ -920,15 +907,14 @@ void meshHex3D::SetupPmlBox(){
     }
 
   } else {
-    NboundaryFaces = 0;
-    boundaryInfo = NULL; // no boundaries
+    NboundaryFaces = 0; // no boundaries
   }
 }
 
 static void addHex(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong NnZ,
                     dfloat x0, dfloat y0, dfloat z0, dfloat dx, dfloat dy, dfloat dz,
-                    hlong *EToV, dfloat *EX, dfloat *EY, dfloat *EZ,
-                    hlong *elementInfo, int type, dlong &e) {
+                    memory<hlong> EToV, memory<dfloat> EX, memory<dfloat> EY, memory<dfloat> EZ,
+                    memory<hlong> elementInfo, int type, dlong &e) {
 
   const hlong i1 = (i0+1)%NnX;
   const hlong j1 = (j0+1)%NnY;
@@ -946,9 +932,9 @@ static void addHex(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong NnZ
   EToV[e*Nverts+6] = i1 + j1*NnX + k1*NnX*NnY;
   EToV[e*Nverts+7] = i0 + j1*NnX + k1*NnX*NnY;
 
-  dfloat *ex = EX+e*Nverts;
-  dfloat *ey = EY+e*Nverts;
-  dfloat *ez = EZ+e*Nverts;
+  dfloat *ex = EX.ptr()+e*Nverts;
+  dfloat *ey = EY.ptr()+e*Nverts;
+  dfloat *ez = EZ.ptr()+e*Nverts;
 
   ex[0] = x0;    ey[0] = y0;    ez[0] = z0;
   ex[1] = x0+dx; ey[1] = y0;    ez[1] = z0;
@@ -962,4 +948,6 @@ static void addHex(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong NnZ
 
   elementInfo[e] = type;
   e++;
-}
\ No newline at end of file
+}
+
+} //namespace libp
diff --git a/libs/mesh/meshSetupPmlBoxQuad2D.cpp b/libs/mesh/meshSetupPmlBoxQuad2D.cpp
index 60a3c5d9d..094de2c0d 100644
--- a/libs/mesh/meshSetupPmlBoxQuad2D.cpp
+++ b/libs/mesh/meshSetupPmlBoxQuad2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,35 +25,21 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshQuad3D::SetupPmlBox(){
-  LIBP_ABORT(string("PMLBOX mesh not currently supprted for Quad3D meshes."))
-}
-
-void meshQuad2D::SetupPmlBox(){
-
-  dim = 2;
-  Nverts = 4; // number of vertices per element
-  Nfaces = 4;
-  NfaceVertices = 2;
+namespace libp {
 
-  // vertices on each face
-  int faceVertices_[4][2] = {{0,1},{1,2},{2,3},{3,0}};
-
-  faceVertices = (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
-  memcpy(faceVertices, faceVertices_[0], NfaceVertices*Nfaces*sizeof(int));
+void mesh_t::SetupPmlBoxQuad2D(){
 
   // find a factorization size = size_x*size_y such that
-  //  size_x>=size_y and are all 'close' to one another
+  //  size_x>=size_y and are 'close' to one another
   int size_x, size_y;
-  factor2(size, size_x, size_y);
+  Factor2(size, size_x, size_y);
 
-  //find our coordinates in the MPI grid such that
-  // rank = rank_x + rank_y*size_x
-  int rank_y = rank / size_x;
-  int rank_x = rank % size_x;
+  //determine (x,y) rank coordinates for this process
+  int rank_x=-1, rank_y=-1;
+  RankDecomp2(size_x, size_y,
+              rank_x, rank_y,
+              rank);
 
   //get global size from settings
   dlong NX, NY;
@@ -81,8 +67,8 @@ void meshQuad2D::SetupPmlBox(){
   settings.getSetting("BOX BOUNDARY FLAG", boundaryFlag);
 
   const int periodicFlag = (boundaryFlag == -1) ? 1 : 0;
-  if (periodicFlag)
-    LIBP_ABORT(string("Periodic boundary unsupported for PMLBOX mesh."))
+  LIBP_ABORT("Periodic boundary unsupported for PMLBOX mesh.",
+             periodicFlag);
 
   //local grid physical sizes
   dfloat DIMX, DIMY;
@@ -96,8 +82,8 @@ void meshQuad2D::SetupPmlBox(){
   dfloat dx = DIMX/NX;
   dfloat dy = DIMY/NY;
 
-  dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x));
-  dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y));
+  dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x));
+  dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y));
 
   //local grid physical sizes
   dfloat dimx = nx*dx;
@@ -143,11 +129,11 @@ void meshQuad2D::SetupPmlBox(){
   if (rank_x==0        && rank_y==size_y-1) Nelements+=pmlNx*pmlNy;
   if (rank_x==size_x-1 && rank_y==size_y-1) Nelements+=pmlNx*pmlNy;
 
-  EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong));
-  EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
+  EToV.malloc(Nelements*Nverts);
+  EX.malloc(Nelements*Nverts);
+  EY.malloc(Nelements*Nverts);
 
-  elementInfo = (hlong*) calloc(Nelements, sizeof(hlong));
+  elementInfo.malloc(Nelements);
 
   dlong e = 0;
 
@@ -168,8 +154,8 @@ void meshQuad2D::SetupPmlBox(){
       dfloat x0 = X0 + dx*i;
       dfloat y0 = Y0 + dy*j;
 
-      dfloat *ex = EX+e*Nverts;
-      dfloat *ey = EY+e*Nverts;
+      dfloat *ex = EX.ptr()+e*Nverts;
+      dfloat *ey = EY.ptr()+e*Nverts;
 
       ex[0] = x0;    ey[0] = y0;
       ex[1] = x0+dx; ey[1] = y0;
@@ -199,8 +185,8 @@ void meshQuad2D::SetupPmlBox(){
         dfloat x0 = X0-pmlWidthx + pmldx*i;
         dfloat y0 = Y0 + dy*j;
 
-        dfloat *ex = EX+e*Nverts;
-        dfloat *ey = EY+e*Nverts;
+        dfloat *ex = EX.ptr()+e*Nverts;
+        dfloat *ey = EY.ptr()+e*Nverts;
 
         ex[0] = x0;       ey[0] = y0;
         ex[1] = x0+pmldx; ey[1] = y0;
@@ -231,8 +217,8 @@ void meshQuad2D::SetupPmlBox(){
         dfloat x0 = X0 + dimx + pmldx*i;
         dfloat y0 = Y0 + dy*j;
 
-        dfloat *ex = EX+e*Nverts;
-        dfloat *ey = EY+e*Nverts;
+        dfloat *ex = EX.ptr()+e*Nverts;
+        dfloat *ey = EY.ptr()+e*Nverts;
 
         ex[0] = x0;       ey[0] = y0;
         ex[1] = x0+pmldx; ey[1] = y0;
@@ -263,8 +249,8 @@ void meshQuad2D::SetupPmlBox(){
         dfloat x0 = X0 + dx*i;
         dfloat y0 = Y0-pmlWidthy + pmldy*j;
 
-        dfloat *ex = EX+e*Nverts;
-        dfloat *ey = EY+e*Nverts;
+        dfloat *ex = EX.ptr()+e*Nverts;
+        dfloat *ey = EY.ptr()+e*Nverts;
 
         ex[0] = x0;    ey[0] = y0;
         ex[1] = x0+dx; ey[1] = y0;
@@ -295,8 +281,8 @@ void meshQuad2D::SetupPmlBox(){
         dfloat x0 = X0 + dx*i;
         dfloat y0 = Y0 + dimy + pmldy*j;
 
-        dfloat *ex = EX+e*Nverts;
-        dfloat *ey = EY+e*Nverts;
+        dfloat *ex = EX.ptr()+e*Nverts;
+        dfloat *ey = EY.ptr()+e*Nverts;
 
         ex[0] = x0;    ey[0] = y0;
         ex[1] = x0+dx; ey[1] = y0;
@@ -327,8 +313,8 @@ void meshQuad2D::SetupPmlBox(){
         dfloat x0 = X0-pmlWidthx + pmldx*i;
         dfloat y0 = Y0-pmlWidthy + pmldy*j;
 
-        dfloat *ex = EX+e*Nverts;
-        dfloat *ey = EY+e*Nverts;
+        dfloat *ex = EX.ptr()+e*Nverts;
+        dfloat *ey = EY.ptr()+e*Nverts;
 
         ex[0] = x0;       ey[0] = y0;
         ex[1] = x0+pmldx; ey[1] = y0;
@@ -359,8 +345,8 @@ void meshQuad2D::SetupPmlBox(){
         dfloat x0 = X0+dimx      + pmldx*i;
         dfloat y0 = Y0-pmlWidthy + pmldy*j;
 
-        dfloat *ex = EX+e*Nverts;
-        dfloat *ey = EY+e*Nverts;
+        dfloat *ex = EX.ptr()+e*Nverts;
+        dfloat *ey = EY.ptr()+e*Nverts;
 
         ex[0] = x0;       ey[0] = y0;
         ex[1] = x0+pmldx; ey[1] = y0;
@@ -391,8 +377,8 @@ void meshQuad2D::SetupPmlBox(){
         dfloat x0 = X0-pmlWidthx + pmldx*i;
         dfloat y0 = Y0+dimy      + pmldy*j;
 
-        dfloat *ex = EX+e*Nverts;
-        dfloat *ey = EY+e*Nverts;
+        dfloat *ex = EX.ptr()+e*Nverts;
+        dfloat *ey = EY.ptr()+e*Nverts;
 
         ex[0] = x0;       ey[0] = y0;
         ex[1] = x0+pmldx; ey[1] = y0;
@@ -423,8 +409,8 @@ void meshQuad2D::SetupPmlBox(){
         dfloat x0 = X0+dimx      + pmldx*i;
         dfloat y0 = Y0+dimy      + pmldy*j;
 
-        dfloat *ex = EX+e*Nverts;
-        dfloat *ey = EY+e*Nverts;
+        dfloat *ex = EX.ptr()+e*Nverts;
+        dfloat *ey = EY.ptr()+e*Nverts;
 
         ex[0] = x0;       ey[0] = y0;
         ex[1] = x0+pmldx; ey[1] = y0;
@@ -439,7 +425,7 @@ void meshQuad2D::SetupPmlBox(){
 
   if (boundaryFlag != -1) { //-1 reserved for periodic case
     NboundaryFaces = 2*NX + 2*NY;
-    boundaryInfo = (hlong*) calloc(NboundaryFaces*(NfaceVertices+1), sizeof(hlong));
+    boundaryInfo.malloc(NboundaryFaces*(NfaceVertices+1));
 
     hlong bcnt = 0;
 
@@ -476,7 +462,8 @@ void meshQuad2D::SetupPmlBox(){
     }
 
   } else {
-    NboundaryFaces = 0;
-    boundaryInfo = NULL; // no boundaries
+    NboundaryFaces = 0; // no boundaries
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSetupPmlBoxTet3D.cpp b/libs/mesh/meshSetupPmlBoxTet3D.cpp
index 093631167..aaf5863ff 100644
--- a/libs/mesh/meshSetupPmlBoxTet3D.cpp
+++ b/libs/mesh/meshSetupPmlBoxTet3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,36 +25,26 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 static void addTets(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong NnZ,
                     dfloat x0, dfloat y0, dfloat z0, dfloat dx, dfloat dy, dfloat dz,
-                    hlong *EToV, dfloat *EX, dfloat *EY, dfloat *EZ,
-                    hlong *elementInfo, int type, dlong &e);
-
-void meshTet3D::SetupPmlBox(){
-
-  dim = 3;
-  Nverts = 4; // number of vertices per element
-  Nfaces = 4;
-  NfaceVertices = 3;
+                    memory<hlong> EToV, memory<dfloat> EX, memory<dfloat> EY, memory<dfloat> EZ,
+                    memory<hlong> elementInfo, int type, dlong &e);
 
-  // vertices on each face
-  int faceVertices_[4][3] = {{0,1,2},{0,1,3},{1,2,3},{2,0,3}};
-
-  faceVertices = (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
-  memcpy(faceVertices, faceVertices_[0], 12*sizeof(int));
+void mesh_t::SetupPmlBoxTet3D(){
 
   // find a factorization size = size_x*size_y*size_z such that
   //  size_x>=size_y>=size_z are all 'close' to one another
   int size_x, size_y, size_z;
-  factor3(size, size_x, size_y, size_z);
+  Factor3(size, size_x, size_y, size_z);
 
-  //find our coordinates in the MPI grid such that
-  // rank = rank_x + rank_y*size_x + rank_z*size_x*size_y
-  int rank_z = rank/(size_x*size_y);
-  int rank_y = (rank-rank_z*size_x*size_y)/size_x;
-  int rank_x = rank % size_x;
+  //determine (x,y,z) rank coordinates for this process
+  int rank_x=-1, rank_y=-1, rank_z=-1;
+  RankDecomp3(size_x, size_y, size_z,
+              rank_x, rank_y, rank_z,
+              rank);
 
   //get global size from settings
   dlong NX, NY, NZ;
@@ -87,8 +77,8 @@ void meshTet3D::SetupPmlBox(){
   settings.getSetting("BOX BOUNDARY FLAG", boundaryFlag);
 
   const int periodicFlag = (boundaryFlag == -1) ? 1 : 0;
-  if (periodicFlag)
-    LIBP_ABORT(string("Periodic boundary unsupported for PMLBOX mesh."))
+  LIBP_ABORT("Periodic boundary unsupported for PMLBOX mesh.",
+             periodicFlag);
 
   //local grid physical sizes
   dfloat DIMX, DIMY, DIMZ;
@@ -104,9 +94,9 @@ void meshTet3D::SetupPmlBox(){
   dfloat dy = DIMY/NY;
   dfloat dz = DIMZ/NZ;
 
-  dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x));
-  dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y));
-  dlong offset_z = rank_z*(NZ/size_z) + mymin(rank_z, (NZ % size_z));
+  dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x));
+  dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y));
+  dlong offset_z = rank_z*(NZ/size_z) + std::min(rank_z, (NZ % size_z));
 
   //local grid physical sizes
   dfloat dimx = nx*dx;
@@ -180,12 +170,12 @@ void meshTet3D::SetupPmlBox(){
   if (rank_x==0        && rank_y==size_y-1 && rank_z==size_z-1) Nelements+=6*pmlNx*pmlNy*pmlNz;
   if (rank_x==size_x-1 && rank_y==size_y-1 && rank_z==size_z-1) Nelements+=6*pmlNx*pmlNy*pmlNz;
 
-  EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong));
-  EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EZ = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
+  EToV.malloc(Nelements*Nverts);
+  EX.malloc(Nelements*Nverts);
+  EY.malloc(Nelements*Nverts);
+  EZ.malloc(Nelements*Nverts);
 
-  elementInfo = (hlong*) calloc(Nelements, sizeof(hlong));
+  elementInfo.malloc(Nelements);
 
   dlong e = 0;
   for(int k=0;k<nz;++k){
@@ -846,7 +836,7 @@ void meshTet3D::SetupPmlBox(){
 
   if (boundaryFlag != -1) { //-1 reserved for periodic case
     NboundaryFaces = 4*NX*NY + 4*NX*NZ + 4*NY*NZ;
-    boundaryInfo = (hlong*) calloc(NboundaryFaces*(NfaceVertices+1), sizeof(hlong));
+    boundaryInfo.malloc(NboundaryFaces*(NfaceVertices+1));
 
     hlong bcnt = 0;
 
@@ -939,15 +929,14 @@ void meshTet3D::SetupPmlBox(){
     }
 
   } else {
-    NboundaryFaces = 0;
-    boundaryInfo = NULL; // no boundaries
+    NboundaryFaces = 0; // no boundaries
   }
 }
 
 static void addTets(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong NnZ,
                     dfloat x0, dfloat y0, dfloat z0, dfloat dx, dfloat dy, dfloat dz,
-                    hlong *EToV, dfloat *EX, dfloat *EY, dfloat *EZ,
-                    hlong *elementInfo, int type, dlong &e) {
+                    memory<hlong> EToV, memory<dfloat> EX, memory<dfloat> EY, memory<dfloat> EZ,
+                    memory<hlong> elementInfo, int type, dlong &e) {
 
   const hlong i1 = (i0+1)%NnX;
   const hlong j1 = (j0+1)%NnY;
@@ -1038,4 +1027,6 @@ static void addTets(hlong i0, hlong j0, hlong k0, hlong NnX, hlong NnY, hlong Nn
 
   elementInfo[e] = type;
   e++;
-}
\ No newline at end of file
+}
+
+} //namespace libp
diff --git a/libs/mesh/meshSetupPmlBoxTri2D.cpp b/libs/mesh/meshSetupPmlBoxTri2D.cpp
index a0db5f11e..0ab065d25 100644
--- a/libs/mesh/meshSetupPmlBoxTri2D.cpp
+++ b/libs/mesh/meshSetupPmlBoxTri2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,35 +25,21 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
 
-void meshTri3D::SetupPmlBox(){
-  LIBP_ABORT(string("PMLBOX mesh not currently supprted for Tri3D meshes."))
-}
-
-void meshTri2D::SetupPmlBox(){
-
-  dim = 2;
-  Nverts = 3; // number of vertices per element
-  Nfaces = 3;
-  NfaceVertices = 2;
+namespace libp {
 
-  // vertices on each face
-  int faceVertices_[4][2] = {{0,1},{1,2},{2,0}};
-
-  faceVertices = (int*) calloc(NfaceVertices*Nfaces, sizeof(int));
-  memcpy(faceVertices, faceVertices_[0], NfaceVertices*Nfaces*sizeof(int));
+void mesh_t::SetupPmlBoxTri2D(){
 
   // find a factorization size = size_x*size_y such that
-  //  size_x>=size_y and are all 'close' to one another
+  //  size_x>=size_y and are 'close' to one another
   int size_x, size_y;
-  factor2(size, size_x, size_y);
+  Factor2(size, size_x, size_y);
 
-  //find our coordinates in the MPI grid such that
-  // rank = rank_x + rank_y*size_x
-  int rank_y = rank / size_x;
-  int rank_x = rank % size_x;
+  //determine (x,y) rank coordinates for this process
+  int rank_x=-1, rank_y=-1;
+  RankDecomp2(size_x, size_y,
+              rank_x, rank_y,
+              rank);
 
   //get global size from settings
   dlong NX, NY;
@@ -81,8 +67,8 @@ void meshTri2D::SetupPmlBox(){
   settings.getSetting("BOX BOUNDARY FLAG", boundaryFlag);
 
   const int periodicFlag = (boundaryFlag == -1) ? 1 : 0;
-  if (periodicFlag)
-    LIBP_ABORT(string("Periodic boundary unsupported for PMLBOX mesh."))
+  LIBP_ABORT("Periodic boundary unsupported for PMLBOX mesh.",
+             periodicFlag);
 
   //local grid physical sizes
   dfloat DIMX, DIMY;
@@ -96,8 +82,8 @@ void meshTri2D::SetupPmlBox(){
   dfloat dx = DIMX/NX;
   dfloat dy = DIMY/NY;
 
-  dlong offset_x = rank_x*(NX/size_x) + mymin(rank_x, (NX % size_x));
-  dlong offset_y = rank_y*(NY/size_y) + mymin(rank_y, (NY % size_y));
+  dlong offset_x = rank_x*(NX/size_x) + std::min(rank_x, (NX % size_x));
+  dlong offset_y = rank_y*(NY/size_y) + std::min(rank_y, (NY % size_y));
 
   //local grid physical sizes
   dfloat dimx = nx*dx;
@@ -145,11 +131,11 @@ void meshTri2D::SetupPmlBox(){
   if (rank_x==size_x-1 && rank_y==size_y-1) Nelements+=2*pmlNx*pmlNy;
 
 
-  EToV = (hlong*) calloc(Nelements*Nverts, sizeof(hlong));
-  EX = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
-  EY = (dfloat*) calloc(Nelements*Nverts, sizeof(dfloat));
+  EToV.malloc(Nelements*Nverts);
+  EX.malloc(Nelements*Nverts);
+  EY.malloc(Nelements*Nverts);
 
-  elementInfo = (hlong*) calloc(Nelements, sizeof(hlong));
+  elementInfo.malloc(Nelements);
 
   dlong e = 0;
 
@@ -495,7 +481,7 @@ void meshTri2D::SetupPmlBox(){
 
   if (boundaryFlag != -1) { //-1 reserved for periodic case
     NboundaryFaces = 2*NX + 2*NY;
-    boundaryInfo = (hlong*) calloc(NboundaryFaces*(NfaceVertices+1), sizeof(hlong));
+    boundaryInfo.malloc(NboundaryFaces*(NfaceVertices+1));
 
     hlong bcnt = 0;
 
@@ -532,7 +518,8 @@ void meshTri2D::SetupPmlBox(){
     }
 
   } else {
-    NboundaryFaces = 0;
-    boundaryInfo = NULL; // no boundaries
+    NboundaryFaces = 0; // no boundaries
   }
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSetupRingPatch.cpp b/libs/mesh/meshSetupRingPatch.cpp
index d3dcc966b..3d6c46b13 100644
--- a/libs/mesh/meshSetupRingPatch.cpp
+++ b/libs/mesh/meshSetupRingPatch.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,195 +25,101 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 //build a new mesh object consisting of the orignal mesh with an
 // 1-element overlap with neighboring meshes
-mesh_t* mesh_t::SetupRingPatch(){
+mesh_t mesh_t::SetupRingPatch(){
 
   //setup the 1-ring halo exchange
   HaloRingSetup();
 
+  /*Copy underlying mesh object*/
+  mesh_t mesh = *this;
+
   //just reuse the current mesh if there are no neighbors
-  if (size==1) return this;
+  if (size==1) return mesh;
 
   // single process communicator for new mesh
-  MPI_Comm* splitComm = new MPI_Comm;
-  MPI_Comm_split(comm, rank, rank, splitComm);
-
-  mesh_t *mesh=NULL;
-  switch(elementType){
-  case TRIANGLES:
-    if(dim==2)
-      mesh = new meshTri2D(platform, settings, *splitComm);
-    else
-      mesh = new meshTri3D(platform, settings, *splitComm);
-    break;
-  case QUADRILATERALS:
-    if(dim==2)
-      mesh = new meshQuad2D(platform, settings, *splitComm);
-    else
-      mesh = new meshQuad3D(platform, settings, *splitComm);
-    break;
-  case TETRAHEDRA:
-    mesh = new meshTet3D(platform, settings, *splitComm);
-    break;
-  case HEXAHEDRA:
-    mesh = new meshHex3D(platform, settings, *splitComm);
-    break;
-  }
-
-  //shallow copy of base mesh geometry
-  mesh->dim           = dim;
-  mesh->Nverts        = Nverts;
-  mesh->Nfaces        = Nfaces;
-  mesh->NfaceVertices = NfaceVertices;
-  mesh->faceVertices  = faceVertices;
+  mesh.comm = comm.Split(rank, rank);
+  mesh.rank = mesh.comm.rank();
+  mesh.size = mesh.comm.size();
 
-  mesh->elementType = elementType;
-
-  mesh->Nnodes = Nnodes; //not really correct, but unused
-  mesh->Nelements = Nelements+totalRingElements;
-  mesh->NelementsGlobal = Nelements+totalRingElements;
+  mesh.Nelements = Nelements+totalRingElements;
+  mesh.NelementsGlobal = Nelements+totalRingElements;
 
   //populate mesh vertices
-  mesh->EX = (dfloat*) calloc(mesh->Nelements*Nverts, sizeof(dfloat));
-  mesh->EY = (dfloat*) calloc(mesh->Nelements*Nverts, sizeof(dfloat));
+  mesh.EX.malloc(mesh.Nelements*Nverts);
+  mesh.EY.malloc(mesh.Nelements*Nverts);
   if(dim==3)
-    mesh->EZ = (dfloat*) calloc(mesh->Nelements*Nverts, sizeof(dfloat));
+    mesh.EZ.malloc(mesh.Nelements*Nverts);
 
-  memcpy(mesh->EX, EX, Nelements*Nverts*sizeof(dfloat));
-  memcpy(mesh->EY, EY, Nelements*Nverts*sizeof(dfloat));
+  mesh.EX.copyFrom(EX, Nelements*Nverts);
+  mesh.EY.copyFrom(EY, Nelements*Nverts);
   if(dim==3)
-    memcpy(mesh->EZ, EZ, Nelements*Nverts*sizeof(dfloat));
+    mesh.EZ.copyFrom(EZ, Nelements*Nverts);
 
-  ringHalo->Exchange(mesh->EX, Nverts, ogs_dfloat);
-  ringHalo->Exchange(mesh->EY, Nverts, ogs_dfloat);
+  ringHalo.Exchange(mesh.EX, Nverts);
+  ringHalo.Exchange(mesh.EY, Nverts);
   if(dim==3)
-    ringHalo->Exchange(mesh->EZ, Nverts, ogs_dfloat);
+    ringHalo.Exchange(mesh.EZ, Nverts);
 
-  mesh->EToV = (hlong*) calloc(mesh->Nelements*Nverts, sizeof(hlong));
-  memcpy(mesh->EToV, EToV, Nelements*Nverts*sizeof(hlong));
-  ringHalo->Exchange(mesh->EToV, Nverts, ogs_hlong);
+  mesh.EToV.malloc(mesh.Nelements*Nverts);
+  mesh.EToV.copyFrom(EToV, Nelements*Nverts);
+  ringHalo.Exchange(mesh.EToV, Nverts);
 
-  mesh->elementInfo = (hlong*) calloc(mesh->Nelements, sizeof(hlong));
-  memcpy(mesh->elementInfo, elementInfo, Nelements*sizeof(hlong));
-  ringHalo->Exchange(mesh->elementInfo, 1, ogs_hlong);
+  mesh.elementInfo.malloc(mesh.Nelements);
+  mesh.elementInfo.copyFrom(elementInfo, Nelements);
+  ringHalo.Exchange(mesh.elementInfo, 1);
 
   // connect elements using parallel sort
-  mesh->ParallelConnect();
+  mesh.Connect();
 
-  mesh->NboundaryFaces = NboundaryFaces;
-  mesh->boundaryInfo = boundaryInfo;
+  mesh.NboundaryFaces = NboundaryFaces;
+  mesh.boundaryInfo = boundaryInfo;
 
   // element-to-boundary condition type
-  mesh->EToB = (int*) calloc(mesh->Nelements*Nfaces, sizeof(int));
-  memcpy(mesh->EToB, EToB, Nelements*Nfaces*sizeof(int));
-  ringHalo->Exchange(mesh->EToB, Nfaces, ogs_int);
+  mesh.EToB.malloc(mesh.Nelements*Nfaces);
+  mesh.EToB.copyFrom(EToB, Nelements*Nfaces);
+  ringHalo.Exchange(mesh.EToB, Nfaces);
 
   // correct bcs (replaces unconnected faces with Dirichlet)
-  for(dlong e=0;e<mesh->Nelements;++e){
+  for(dlong e=0;e<mesh.Nelements;++e){
     for(int f=0;f<Nfaces;++f){
       dlong id = e*Nfaces+f;
-      if(mesh->EToE[id]==-1 && mesh->EToB[id]==-1){
-        mesh->EToB[id] = 1; // hack to 1 assume Dirichlet
-        mesh->EToE[id] = e; // hack to 1 assume Dirichlet
+      if(mesh.EToE[id]==-1 && mesh.EToB[id]==-1){
+        mesh.EToB[id] = 1; // hack to 1 assume Dirichlet
+        mesh.EToE[id] = e; // hack to 1 assume Dirichlet
       }
     }
   }
 
-  //Reference Nodes
-  mesh->N = N;
-  mesh->Np = Np;
-  mesh->Nq = Nq;
-  mesh->Nfp = Nfp;
-
-  mesh->vertexNodes = vertexNodes;
-
-  mesh->r = r;
-  mesh->s = s;
-  mesh->t = t;
-
-  mesh->w = w;
-
-  mesh->D = D;
-  mesh->Dr = Dr;
-  mesh->Ds = Ds;
-  mesh->Dt = Dt;
-  mesh->S = S;
-  mesh->Srr = Srr;
-  mesh->Srs = Srs;
-  mesh->Srt = Srt;
-  mesh->Sss = Sss;
-  mesh->Sst = Sst;
-  mesh->Stt = Stt;
-  mesh->MM = MM;
-  mesh->invMM = invMM;
-  mesh->sM = sM;
-  mesh->faceNodes = faceNodes;
-  mesh->LIFT = LIFT;
-
-  mesh->plotNp = plotNp;
-  mesh->plotNelements = plotNelements;
-  mesh->plotNverts = plotNverts;
-  mesh->plotR = plotR;
-  mesh->plotS = plotS;
-  mesh->plotT = plotT;
-  mesh->plotInterp = plotInterp;
-  mesh->plotEToV = plotEToV;
-
-  mesh->cubNp = cubNp;
-  mesh->cubNq = cubNq;
-  mesh->cubNfp = cubNfp;
-  mesh->cubr = cubr;
-  mesh->cubs = cubs;
-  mesh->cubt = cubt;
-  mesh->cubw = cubw;
-  mesh->cubInterp = cubInterp;
-  mesh->cubProject = cubProject;
-  mesh->cubD = cubD;
-  mesh->cubPDT = cubPDT;
-  mesh->cubPDrT = cubPDrT;
-  mesh->cubPDsT = cubPDsT;
-  mesh->cubPDtT = cubPDtT;
-  mesh->intNfp = intNfp;
-  mesh->intInterp = intInterp;
-  mesh->intLIFT = intLIFT;
-
-  mesh->NpFEM = NpFEM;
-  mesh->NelFEM = NelFEM;
-  mesh->rFEM = rFEM;
-  mesh->sFEM = sFEM;
-  mesh->tFEM = tFEM;
-  mesh->SEMFEMInterp = SEMFEMInterp;
-  mesh->FEMEToV = FEMEToV;
-
-
-  mesh->ringHalo = NULL;
-
   //Halo
-  mesh->HaloSetup();
+  mesh.HaloSetup();
 
-  // compute physical (x,y) locations of the element nodes
-  mesh->PhysicalNodes();
-
-  // compute geometric factors
-  mesh->GeometricFactors();
+  // connect face vertices
+  mesh.ConnectFaceVertices();
 
   // connect face nodes (find trace indices)
-  mesh->ConnectFaceNodes();
+  mesh.ConnectFaceNodes();
 
-  // compute surface geofacs
-  mesh->SurfaceGeometricFactors();
+  // make global indexing
+  mesh.ConnectNodes();
 
-  // make a global indexing
-  mesh->ParallelConnectNodes();
+  // compute physical (x,y) locations of the element nodes
+  mesh.PhysicalNodes();
 
-  // make an ogs operator and label local/global gather elements
-  mesh->ParallelGatherScatterSetup();
+  // compute geometric factors
+  mesh.GeometricFactors();
 
-  mesh->OccaSetup();
+  // compute surface geofacs
+  mesh.SurfaceGeometricFactors();
+
+  // label local/global gather elements
+  mesh.GatherScatterSetup();
 
   return mesh;
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSetupSEMFEM.cpp b/libs/mesh/meshSetupSEMFEM.cpp
index 1d0f3b438..f325c370d 100644
--- a/libs/mesh/meshSetupSEMFEM.cpp
+++ b/libs/mesh/meshSetupSEMFEM.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,170 +25,95 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
-#include "mesh/mesh3D.hpp"
 
-mesh_t* mesh_t::SetupSEMFEM(hlong **globalIds_, int *Nfp_, int **faceNodes_){
+namespace libp {
+
+mesh_t mesh_t::SetupSEMFEM(memory<hlong>& globalIds_,
+                           memory<int>& mapB_){
 
   //partially assembled fem mesh (result of projecting sem element to larger space)
-  mesh_t *pmesh=NULL;
-  switch(elementType){
-  //quads and hexes reuse the SEM ndoes for the FEM problem
-  case QUADRILATERALS:
-    pmesh = this; break;
-  case TETRAHEDRA:
-    pmesh = this; break;
-  case HEXAHEDRA:
-    pmesh = this; break;
-  case TRIANGLES:
-    if(dim==2)
-      pmesh = new meshTri2D(platform, settings, comm);
-    else
-      pmesh = new meshTri3D(platform, settings, comm);
-    break;
-  }
+  mesh_t pmesh=*this;
 
   //setup the intermediate mesh for tris and tets
-  if (elementType==TRIANGLES) {
-    pmesh->dim           = dim;
-    pmesh->elementType   = elementType;
-    pmesh->Nverts        = Nverts;
-    pmesh->Nfaces        = Nfaces;
-    pmesh->NfaceVertices = NfaceVertices;
-    pmesh->faceVertices  = faceVertices;
-
+  if (elementType==Mesh::TRIANGLES) {
     /* SEMFEM data */
-    SEMFEMNodesTri2D(N, &NpFEM, &rFEM, &sFEM);
-    SEMFEMEToVTri2D(N, &NelFEM, &FEMEToV);
+    SEMFEMNodesTri2D(N, NpFEM, rFEM, sFEM);
+    SEMFEMEToVTri2D(N, NelFEM, FEMEToV);
 
-    SEMFEMInterp = (dfloat*) calloc(NpFEM*Np, sizeof(dfloat));
-    SEMFEMInterpMatrixTri2D(N, Np, r, s, NpFEM, rFEM, sFEM, SEMFEMInterp);
+    SEMFEMInterpMatrixTri2D(N, r, s, rFEM, sFEM, SEMFEMInterp);
 
     //set semfem nodes as the grid points
-    pmesh->Np = NpFEM;
-    pmesh->r  = rFEM;
-    pmesh->s  = sFEM;
+    pmesh.Np = NpFEM;
+    pmesh.r  = rFEM;
+    pmesh.s  = sFEM;
 
     //count number of face nodes in the semfem element
     dfloat NODETOL = 1e-6;
-    pmesh->Nfp=0;
-    for (int n=0;n<pmesh->Np;n++)
-      if (fabs(pmesh->s[n]+1)<NODETOL) pmesh->Nfp++;
+    pmesh.Nfp=0;
+    for (int n=0;n<pmesh.Np;n++)
+      if (std::abs(pmesh.s[n]+1)<NODETOL) pmesh.Nfp++;
 
     //remake the faceNodes array
-    pmesh->faceNodes = (int *) calloc(Nfaces*pmesh->Nfp,sizeof(int));
+    pmesh.faceNodes.malloc(Nfaces*pmesh.Nfp);
     int f0=0, f1=0, f2=0;
-    for (int n=0;n<pmesh->Np;n++) {
-      if (fabs(pmesh->s[n]+1)<NODETOL)           pmesh->faceNodes[0*pmesh->Nfp+f0++] = n;
-      if (fabs(pmesh->r[n]+pmesh->s[n])<NODETOL) pmesh->faceNodes[1*pmesh->Nfp+f1++] = n;
-      if (fabs(pmesh->r[n]+1)<NODETOL)           pmesh->faceNodes[2*pmesh->Nfp+f2++] = n;
+    for (int n=0;n<pmesh.Np;n++) {
+      if (std::abs(pmesh.s[n]+1)<NODETOL)          pmesh.faceNodes[0*pmesh.Nfp+f0++] = n;
+      if (std::abs(pmesh.r[n]+pmesh.s[n])<NODETOL) pmesh.faceNodes[1*pmesh.Nfp+f1++] = n;
+      if (std::abs(pmesh.r[n]+1)<NODETOL)          pmesh.faceNodes[2*pmesh.Nfp+f2++] = n;
     }
 
     //remake vertexNodes array
-    pmesh->vertexNodes = (int*) calloc(Nverts, sizeof(int));
-    for(int n=0;n<pmesh->Np;++n){
-      if( (pmesh->r[n]+1)*(pmesh->r[n]+1)+(pmesh->s[n]+1)*(pmesh->s[n]+1)<NODETOL)
-        pmesh->vertexNodes[0] = n;
-      if( (pmesh->r[n]-1)*(pmesh->r[n]-1)+(pmesh->s[n]+1)*(pmesh->s[n]+1)<NODETOL)
-        pmesh->vertexNodes[1] = n;
-      if( (pmesh->r[n]+1)*(pmesh->r[n]+1)+(pmesh->s[n]-1)*(pmesh->s[n]-1)<NODETOL)
-        pmesh->vertexNodes[2] = n;
+    pmesh.vertexNodes.malloc(Nverts);
+    for(int n=0;n<pmesh.Np;++n){
+      if( (pmesh.r[n]+1)*(pmesh.r[n]+1)+(pmesh.s[n]+1)*(pmesh.s[n]+1)<NODETOL)
+        pmesh.vertexNodes[0] = n;
+      if( (pmesh.r[n]-1)*(pmesh.r[n]-1)+(pmesh.s[n]+1)*(pmesh.s[n]+1)<NODETOL)
+        pmesh.vertexNodes[1] = n;
+      if( (pmesh.r[n]+1)*(pmesh.r[n]+1)+(pmesh.s[n]-1)*(pmesh.s[n]-1)<NODETOL)
+        pmesh.vertexNodes[2] = n;
     }
 
-    // use existing mesh connectivity
-    pmesh->Nnodes = Nnodes;
-    pmesh->EX = EX; // coordinates of vertices for each element
-    pmesh->EY = EY;
-    pmesh->EZ = EZ;
-
-    pmesh->Nelements = Nelements;
-    pmesh->NelementsGlobal = NelementsGlobal;
-    pmesh->EToV = EToV; // element-to-vertex connectivity
-    pmesh->EToE = EToE; // element-to-element connectivity
-    pmesh->EToF = EToF; // element-to-(local)face connectivity
-    pmesh->EToP = EToP; // element-to-partition/process connectivity
-    pmesh->EToB = EToB; // element-to-boundary condition type
-
-    pmesh->elementInfo = elementInfo;
-
-    pmesh->NboundaryFaces = NboundaryFaces;
-    pmesh->boundaryInfo   = boundaryInfo;
-
-    //use existing halo
-    pmesh->halo = halo;
-    pmesh->NinternalElements = NinternalElements;
-    pmesh->NhaloElements = NhaloElements;
-    pmesh->totalHaloPairs = totalHaloPairs;
-    pmesh->internalElementIds = internalElementIds;
-    pmesh->haloElementIds = haloElementIds;
-
     // compute physical (x,y) locations FEM vertices
-    pmesh->PhysicalNodes();
+    pmesh.PhysicalNodes();
 
     // connect face nodes (find trace indices)
-    pmesh->ConnectFaceNodes();
+    pmesh.ConnectFaceNodes();
 
     // make a global indexing
-    pmesh->ParallelConnectNodes();
-    //pmesh->globalIds is now populated
-  }
-
-  //need to return this data
-  *globalIds_ = pmesh->globalIds;
-  *Nfp_ = pmesh->Nfp;
-  *faceNodes_ = pmesh->faceNodes;
-
-  //now build the full degree 1 fem mesh
-  mesh_t *femMesh=NULL;
-  switch(elementType){
-  case TRIANGLES:
-    if(dim==2)
-      femMesh = new meshTri2D(platform, settings, comm);
-    else
-      femMesh = new meshTri3D(platform, settings, comm);
-    break;
-  case QUADRILATERALS:
-    if(dim==2)
-      femMesh = new meshQuad2D(platform, settings, comm);
-    else
-      femMesh = new meshQuad3D(platform, settings, comm);
+    pmesh.ConnectNodes();
+    //pmesh.globalIds is now populated
+    //pmesh.mapB is now populated
+  } else if (elementType==Mesh::QUADRILATERALS) {
     NpFEM = Np;
     NelFEM = N*N;
-    FEMEToV = (int*) malloc(NelFEM*Nverts*sizeof(int));
     SEMFEMEToVQuad2D(N, FEMEToV);
-    break;
-  case TETRAHEDRA:
-    femMesh = new meshTet3D(platform, settings, comm);
+  } else if (elementType==Mesh::TETRAHEDRA){
     NpFEM = Np;
     NelFEM = N*N*N;
-    FEMEToV = (int*) malloc(NelFEM*Nverts*sizeof(int));
     SEMFEMEToVTet3D(N, FEMEToV);
-    break;
-  case HEXAHEDRA:
-    femMesh = new meshHex3D(platform, settings, comm);
+  } else { //Mesh::HEXAHEDRA
     NpFEM = Np;
     NelFEM = N*N*N;
-    FEMEToV = (int*) malloc(NelFEM*Nverts*sizeof(int));
     SEMFEMEToVHex3D(N, FEMEToV);
-    break;
   }
 
-  int femN = 1; //degree of fem approximation
-  femMesh->dim           = dim;
-  femMesh->elementType   = elementType;
-  femMesh->Nverts        = Nverts;
-  femMesh->Nfaces        = Nfaces;
-  femMesh->NfaceVertices = NfaceVertices;
-  femMesh->faceVertices  = faceVertices;
+  //need to return this data
+  globalIds_ = pmesh.globalIds;
+  mapB_ = pmesh.mapB;
+
+  //now build the full degree 1 fem mesh
+  mesh_t femMesh=*this;
+
+  femMesh.N = 1; //degree of fem approximation
 
   /* allocate space for node coordinates */
-  femMesh->Nelements = NelFEM*Nelements;
-  dlong NFEMverts = femMesh->Nelements*Nverts;
-  femMesh->EToV = (hlong*) calloc(NFEMverts, sizeof(hlong));
-  femMesh->EX = (dfloat*) calloc(NFEMverts, sizeof(dfloat));
-  femMesh->EY = (dfloat*) calloc(NFEMverts, sizeof(dfloat));
+  femMesh.Nelements = NelFEM*Nelements;
+  dlong NFEMverts = femMesh.Nelements*Nverts;
+  femMesh.EToV.malloc(NFEMverts);
+  femMesh.EX.malloc(NFEMverts);
+  femMesh.EY.malloc(NFEMverts);
   if (dim==3)
-    femMesh->EZ = (dfloat*) calloc(NFEMverts, sizeof(dfloat));
+    femMesh.EZ.malloc(NFEMverts);
 
   for(dlong e=0;e<Nelements;++e){
     for (int n=0;n<NelFEM;n++) {
@@ -199,91 +124,91 @@ mesh_t* mesh_t::SetupSEMFEM(hlong **globalIds_, int *Nfp_, int **faceNodes_){
         dlong id = e*NpFEM + FEMEToV[n*Nverts+i];
 
         /* read vertex triplet for triangle */
-        femMesh->EToV[femId+i] = pmesh->globalIds[id];
+        femMesh.EToV[femId+i] = pmesh.globalIds[id];
 
-        femMesh->EX[femId+i] = pmesh->x[id];
-        femMesh->EY[femId+i] = pmesh->y[id];
+        femMesh.EX[femId+i] = pmesh.x[id];
+        femMesh.EY[femId+i] = pmesh.y[id];
         if (dim==3)
-          femMesh->EZ[femId+i] = pmesh->z[id];
+          femMesh.EZ[femId+i] = pmesh.z[id];
       }
     }
   }
 
-  // connect elements using parallel sort
-  femMesh->ParallelConnect();
-
   // load reference (r,s) element nodes
-  femMesh->ReferenceNodes(femN);
+  femMesh.ReferenceNodes();
+
+  // connect elements using parallel sort
+  femMesh.Connect();
 
   //identify the nodes on the SEMFEM element faces
-  int *faceFlag = (int*) calloc(pmesh->Np*Nfaces,sizeof(int));
+  memory<int> faceFlag(pmesh.Np*Nfaces, 0);
   for (int f=0;f<Nfaces;f++) {
-    for (int n=0;n<pmesh->Nfp;n++) {
-      int id = pmesh->faceNodes[f*pmesh->Nfp+n];
-      faceFlag[f*pmesh->Np + id] = 1; //flag the nodes on this face
+    for (int n=0;n<pmesh.Nfp;n++) {
+      int id = pmesh.faceNodes[f*pmesh.Nfp+n];
+      faceFlag[f*pmesh.Np + id] = 1; //flag the nodes on this face
     }
   }
 
   //map from faces of fem sub-elements to the macro element face number
-  int *femFaceMap = (int*) calloc(NelFEM*femMesh->Nfaces,sizeof(int));
-  for (int n=0;n<NelFEM*femMesh->Nfaces;n++) femFaceMap[n] = -1;
+  memory<int> femFaceMap(NelFEM*femMesh.Nfaces, 0);
+  for (int n=0;n<NelFEM*femMesh.Nfaces;n++) femFaceMap[n] = -1;
 
   for (int n=0;n<NelFEM;n++) {
-    for (int f=0;f<femMesh->Nfaces;f++) {
+    for (int f=0;f<femMesh.Nfaces;f++) {
 
       for (int face=0; face<Nfaces;face++) {
 
         //count the nodes on this face which are on a macro face
         int NvertsOnFace = 0;
-        for (int i=0;i<femMesh->Nfp;i++){
-          int id = femMesh->faceNodes[f*femMesh->Nfp+i];
+        for (int i=0;i<femMesh.Nfp;i++){
+          int id = femMesh.faceNodes[f*femMesh.Nfp+i];
           int v  = FEMEToV[n*Nverts+id];
-          NvertsOnFace += faceFlag[face*pmesh->Np + v];
+          NvertsOnFace += faceFlag[face*pmesh.Np + v];
         }
-        if (NvertsOnFace == femMesh->Nfp)
-          femFaceMap[n*femMesh->Nfaces+f] = face; //on macro face
+        if (NvertsOnFace == femMesh.Nfp)
+          femFaceMap[n*femMesh.Nfaces+f] = face; //on macro face
       }
     }
   }
 
   //fill the boundary flag array from the original EToB
-  femMesh->EToB = (int*) calloc(femMesh->Nelements*femMesh->Nfaces, sizeof(int));
+  femMesh.EToB.malloc(femMesh.Nelements*femMesh.Nfaces, 0);
   for (dlong e=0;e<Nelements;e++) {
     for (int n=0;n<NelFEM;n++) {
-      for (int f=0;f<femMesh->Nfaces;f++) {
-        int face = femFaceMap[n*femMesh->Nfaces+f];
+      for (int f=0;f<femMesh.Nfaces;f++) {
+        int face = femFaceMap[n*femMesh.Nfaces+f];
         if (face>-1) {
-          femMesh->EToB[(e*NelFEM +n)*femMesh->Nfaces +f] = EToB[e*Nfaces + face];
+          femMesh.EToB[(e*NelFEM +n)*femMesh.Nfaces +f] = EToB[e*Nfaces + face];
         }
       }
     }
   }
-  free(faceFlag);
-  free(femFaceMap);
 
   // set up halo exchange info for MPI (do before connect face nodes)
-  femMesh->HaloSetup();
+  femMesh.HaloSetup();
 
-  // compute physical (x,y) locations of the element nodes
-  femMesh->PhysicalNodes();
-
-  // compute geometric factors
-  femMesh->GeometricFactors();
+  // connect face vertices
+  femMesh.ConnectFaceVertices();
 
   // connect face nodes (find trace indices)
-  // femMesh->ConnectFaceNodes();
+  femMesh.ConnectFaceNodes();
 
-  // compute surface geofacs
-  // femMesh->SurfaceGeometricFactors();
+  // make global indexing
+  femMesh.ConnectNodes();
+
+  // compute physical (x,y) locations of the element nodes
+  femMesh.PhysicalNodes();
 
-  // make a global indexing
-  //femMesh->ParallelConnectNodes();
+  // compute geometric factors
+  femMesh.GeometricFactors();
 
-  // make an ogs operator and label local/global gather elements
-  //femMesh->ParallelGatherScatterSetup();
+  // compute surface geofacs
+  // femMesh.SurfaceGeometricFactors();
 
-  //dont need to setup occa buffers for this mesh
-  // femMesh->OccaSetup();
+  // label local/global gather elements
+  femMesh.GatherScatterSetup();
 
   return femMesh;
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSurfaceGeometricFactorsHex3D.cpp b/libs/mesh/meshSurfaceGeometricFactorsHex3D.cpp
index eb02b7f45..616eafb1d 100644
--- a/libs/mesh/meshSurfaceGeometricFactorsHex3D.cpp
+++ b/libs/mesh/meshSurfaceGeometricFactorsHex3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,72 +25,49 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-/*
-static void computeFrame(dfloat nx, dfloat ny, dfloat nz,
-                  dfloat &tanx, dfloat &tany, dfloat &tanz,
-                  dfloat &binx, dfloat &biny, dfloat &binz){
-
-  dfloat rdotn, ranx, rany, ranz;
-  do{
-    ranx = drand48();
-    rany = drand48();
-    ranz = drand48();
-
-    dfloat magran = sqrt(ranx*ranx+rany*rany+ranz*ranz);
-
-    ranx /= magran;
-    rany /= magran;
-    ranz /= magran;
-
-    rdotn = nx*ranx+ny*rany+nz*ranz;
-  }while(fabs(rdotn)<1e-4);
-
-  tanx = ny*ranz - nz*rany;
-  tany = nz*ranx - nx*ranz;
-  tanz = nx*rany - ny*ranx;
-
-  dfloat magtan = sqrt(tanx*tanx+tany*tany+tanz*tanz);
-
-  tanx /= magtan;
-  tany /= magtan;
-  tanz /= magtan;
-
-  binx = ny*tanz - nz*tany;
-  biny = nz*tanx - nx*tanz;
-  binz = nx*tany - ny*tanx;
-
-  dfloat magbin = sqrt(binx*binx+biny*biny+binz*binz);
-
-  binx /= magbin;
-  biny /= magbin;
-  binz /= magbin;
-
-  //  printf("nor = %g,%g,%g; tan = %g,%g,%g; bin = %g,%g,%g\n", nx, ny, nz, tanx, tany, tanz, binx, biny, binz);
-}
-*/
+namespace libp {
 
 /* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */
-void meshHex3D::SurfaceGeometricFactors(){
+void mesh_t::SurfaceGeometricFactorsHex3D(){
 
   /* unified storage array for geometric factors */
-  Nsgeo = 8; //17; (old)
-  sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*
-                                Nsgeo*Nfp*Nfaces,
-                                sizeof(dfloat));
-
-  dfloat *xre = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *xse = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *xte = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yre = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yse = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yte = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *zre = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *zse = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *zte = (dfloat*) calloc(Np, sizeof(dfloat));
-
-  for(dlong e=0;e<Nelements+totalHaloPairs;++e){ /* for each element */
+  Nsgeo = 8;
+
+  NXID  = 0;
+  NYID  = 1;
+  NZID  = 2;
+  SJID  = 3;
+  IJID  = 4;
+  IHID  = 5;
+  WSJID = 6;
+  WIJID = 7;
+
+  props["defines/" "p_Nsgeo"]= Nsgeo;
+  props["defines/" "p_NXID"]= NXID;
+  props["defines/" "p_NYID"]= NYID;
+  props["defines/" "p_NZID"]= NZID;
+  props["defines/" "p_SJID"]= SJID;
+  props["defines/" "p_IJID"]= IJID;
+  props["defines/" "p_IHID"]= IHID;
+  props["defines/" "p_WSJID"]= WSJID;
+  props["defines/" "p_WIJID"]= WIJID;
+
+  sgeo.malloc(Nelements*Nsgeo*Nfp*Nfaces);
+
+  memory<dfloat> h((Nelements+totalHaloPairs)*Nfp*Nfaces);
+
+  memory<dfloat> xre(Np);
+  memory<dfloat> xse(Np);
+  memory<dfloat> xte(Np);
+  memory<dfloat> yre(Np);
+  memory<dfloat> yse(Np);
+  memory<dfloat> yte(Np);
+  memory<dfloat> zre(Np);
+  memory<dfloat> zse(Np);
+  memory<dfloat> zte(Np);
+
+  for(dlong e=0;e<Nelements;++e){ /* for each element */
 
     for(int k=0;k<Nq;++k){
       for(int j=0;j<Nq;++j){
@@ -162,8 +139,10 @@ void meshHex3D::SurfaceGeometricFactors(){
         sgeo[base+SJID] = sJ;
         sgeo[base+IJID] = 1./J;
 
-        sgeo[base+WIJID] = 1./(J*w[0]);
-        sgeo[base+WSJID] = sJ*w[i%Nq]*w[i/Nq];
+        sgeo[base+WIJID] = 1./(J*gllw[0]);
+        sgeo[base+WSJID] = sJ*gllw[i%Nq]*gllw[i/Nq];
+
+        h[Nfaces*Nfp*e + Nfp*f + i] = sJ/J;
 
         // computeFrame(nx, ny, nz,
         //              sgeo[base+STXID], sgeo[base+STYID], sgeo[base+STZID],
@@ -172,19 +151,28 @@ void meshHex3D::SurfaceGeometricFactors(){
     }
   }
 
+  halo.Exchange(h, Nfp*Nfaces);
+
   for(dlong e=0;e<Nelements;++e){ /* for each non-halo element */
-    for(int n=0;n<Nfp*Nfaces;++n){
-      dlong baseM = e*Nfp*Nfaces + n;
-      dlong baseP = mapP[baseM];
-      // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness)
-      dfloat hinvM = sgeo[baseM*Nsgeo + SJID]*sgeo[baseM*Nsgeo + IJID];
-      dfloat hinvP = sgeo[baseP*Nsgeo + SJID]*sgeo[baseP*Nsgeo + IJID];
-      sgeo[baseM*Nsgeo+IHID] = mymax(hinvM,hinvP);
-      sgeo[baseP*Nsgeo+IHID] = mymax(hinvM,hinvP);
+    for(int f=0;f<Nfaces;++f){
+      for(int n=0;n<Nfp;++n){
+        dlong baseM = e*Nfp*Nfaces + f*Nfp + n;
+        dlong baseP = mapP[baseM];
+        if(baseP<0) baseP = baseM;
+
+        // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness)
+        dfloat hinvM = h[baseM];
+        dfloat hinvP = h[baseP];
+        sgeo[baseM*Nsgeo+IHID] = std::max(hinvM,hinvP);
+
+        // if (EToB[f+e*Nfaces] > 0) { //enforce a stronger penalty on boundaries
+        //   sgeo[baseM*Nsgeo+IHID] *= 2;
+        // }
+      }
     }
   }
 
-  free(xre); free(xse); free(xte);
-  free(yre); free(yse); free(yte);
-  free(zre); free(zse); free(zte);
+  o_sgeo = platform.malloc<dfloat>(sgeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSurfaceGeometricFactorsQuad2D.cpp b/libs/mesh/meshSurfaceGeometricFactorsQuad2D.cpp
index 44d288d4b..4c5e40bad 100644
--- a/libs/mesh/meshSurfaceGeometricFactorsQuad2D.cpp
+++ b/libs/mesh/meshSurfaceGeometricFactorsQuad2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,23 +25,42 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
+
+namespace libp {
 
 /* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */
-void meshQuad2D::SurfaceGeometricFactors(){
+void mesh_t::SurfaceGeometricFactorsQuad2D(){
 
   /* unified storage array for geometric factors */
   Nsgeo = 7;
-  sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*
-                                Nsgeo*Nfp*Nfaces,
-                                sizeof(dfloat));
 
-  dfloat *xre = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *xse = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yre = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yse = (dfloat*) calloc(Np, sizeof(dfloat));
+  NXID  = 0;
+  NYID  = 1;
+  SJID  = 2;
+  IJID  = 3;
+  IHID  = 4;
+  WSJID = 5;
+  WIJID = 6;
+
+  props["defines/" "p_Nsgeo"]= Nsgeo;
+  props["defines/" "p_NXID"]= NXID;
+  props["defines/" "p_NYID"]= NYID;
+  props["defines/" "p_SJID"]= SJID;
+  props["defines/" "p_IJID"]= IJID;
+  props["defines/" "p_IHID"]= IHID;
+  props["defines/" "p_WSJID"]= WSJID;
+  props["defines/" "p_WIJID"]= WIJID;
+
+  sgeo.malloc(Nelements*Nsgeo*Nfp*Nfaces);
 
-  for(dlong e=0;e<Nelements+totalHaloPairs;++e){ /* for each element */
+  memory<dfloat> hinv((Nelements+totalHaloPairs)*Nfp*Nfaces);
+
+  memory<dfloat> xre(Np);
+  memory<dfloat> xse(Np);
+  memory<dfloat> yre(Np);
+  memory<dfloat> yse(Np);
+
+  for(dlong e=0;e<Nelements;++e){ /* for each element */
 
     for(int j=0;j<Nq;++j){
       for(int i=0;i<Nq;++i){
@@ -98,24 +117,36 @@ void meshQuad2D::SurfaceGeometricFactors(){
         sgeo[base+SJID] = sJ;
         sgeo[base+IJID] = 1./J;
 
-        sgeo[base+WIJID] = 1./(J*w[0]);
-        sgeo[base+WSJID] = sJ*w[i];
+        sgeo[base+WIJID] = 1./(J*gllw[0]);
+        sgeo[base+WSJID] = sJ*gllw[i];
+
+        hinv[Nfaces*Nfp*e + Nfp*f + i] = sJ/J;
       }
     }
   }
 
-  for(dlong e=0;e<Nelements;++e){ /* for each non-halo element */
-    for(int n=0;n<Nfp*Nfaces;++n){
-      dlong baseM = e*Nfp*Nfaces + n;
-      dlong baseP = mapP[baseM];
-      if(baseP<0) baseP = baseM;
-
-      // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness)
-      dfloat hinvM = sgeo[baseM*Nsgeo + SJID]*sgeo[baseM*Nsgeo + IJID];
-      dfloat hinvP = sgeo[baseP*Nsgeo + SJID]*sgeo[baseP*Nsgeo + IJID];
+  halo.Exchange(hinv, Nfp*Nfaces);
 
-      sgeo[baseM*Nsgeo+IHID] = mymax(hinvM,hinvP);
-      sgeo[baseP*Nsgeo+IHID] = mymax(hinvM,hinvP);
+  for(dlong e=0;e<Nelements;++e){ /* for each non-halo element */
+    for(int f=0;f<Nfaces;++f){
+      for(int n=0;n<Nfp;++n){
+        dlong baseM = e*Nfp*Nfaces + f*Nfp + n;
+        dlong baseP = mapP[baseM];
+        if(baseP<0) baseP = baseM;
+
+        // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness)
+        dfloat hinvM = hinv[baseM];
+        dfloat hinvP = hinv[baseP];
+        sgeo[baseM*Nsgeo+IHID] = std::max(hinvM,hinvP);
+
+        // if (EToB[f+e*Nfaces] > 0) { //enforce a stronger penalty on boundaries
+        //   sgeo[baseM*Nsgeo+IHID] *= 2;
+        // }
+      }
     }
   }
+
+  o_sgeo = platform.malloc<dfloat>(sgeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSurfaceGeometricFactorsQuad3D.cpp b/libs/mesh/meshSurfaceGeometricFactorsQuad3D.cpp
index b1816330f..b2f201e31 100644
--- a/libs/mesh/meshSurfaceGeometricFactorsQuad3D.cpp
+++ b/libs/mesh/meshSurfaceGeometricFactorsQuad3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,43 +25,62 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 /* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */
-void meshQuad3D::SurfaceGeometricFactors(){
+void mesh_t::SurfaceGeometricFactorsQuad3D(){
 
   /* unified storage array for geometric factors */
-  Nsgeo = 14; // fix later
-  sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*
-                                Nsgeo*Nfp*Nfaces,
-                                sizeof(dfloat));
+  Nsgeo = 8;
+
+  NXID  = 0;
+  NYID  = 1;
+  NZID  = 2;
+  SJID  = 3;
+  IJID  = 4;
+  IHID  = 5;
+  WSJID = 6;
+  WIJID = 7;
+
+  props["defines/" "p_Nsgeo"]= Nsgeo;
+  props["defines/" "p_NXID"]= NXID;
+  props["defines/" "p_NYID"]= NYID;
+  props["defines/" "p_NZID"]= NZID;
+  props["defines/" "p_SJID"]= SJID;
+  props["defines/" "p_IJID"]= IJID;
+  props["defines/" "p_IHID"]= IHID;
+  props["defines/" "p_WSJID"]= WSJID;
+  props["defines/" "p_WIJID"]= WIJID;
+
+  sgeo.malloc(Nelements*Nsgeo*Nfp*Nfaces);
 
-  cubsgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*
-                                Nsgeo*cubNq*Nfaces,
-                                sizeof(dfloat));
+  memory<dfloat> hinv((Nelements+totalHaloPairs)*Nfp*Nfaces);
 
-  dfloat *_cubx = (dfloat*) calloc((Nelements+totalHaloPairs)*
-                                  cubNq*Nfaces, sizeof(dfloat));
+  // cubsgeo.malloc(Nelements*Nsgeo*cubNfp*Nfaces);
 
-  dfloat *_cuby = (dfloat*) calloc((Nelements+totalHaloPairs)*
-                                  cubNq*Nfaces, sizeof(dfloat));
+  // dfloat *_cubx = (dfloat*) calloc((Nelements+totalHaloPairs)*
+  //                                 cubNq*Nfaces, sizeof(dfloat));
 
-  dfloat *_cubz = (dfloat*) calloc((Nelements+totalHaloPairs)*
-                                  cubNq*Nfaces, sizeof(dfloat));
+  // dfloat *_cuby = (dfloat*) calloc((Nelements+totalHaloPairs)*
+  //                                 cubNq*Nfaces, sizeof(dfloat));
 
+  // dfloat *_cubz = (dfloat*) calloc((Nelements+totalHaloPairs)*
+  //                                 cubNq*Nfaces, sizeof(dfloat));
 
 
-  dfloat *xr = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yr = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *zr = (dfloat*) calloc(Np, sizeof(dfloat));
 
-  dfloat *xs = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *ys = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *zs = (dfloat*) calloc(Np, sizeof(dfloat));
+  memory<dfloat> xr(Np);
+  memory<dfloat> yr(Np);
+  memory<dfloat> zr(Np);
 
-  dfloat *J  = (dfloat*) calloc(Np, sizeof(dfloat));
+  memory<dfloat> xs(Np);
+  memory<dfloat> ys(Np);
+  memory<dfloat> zs(Np);
 
-  for(int e=0;e<Nelements+totalHaloPairs;++e){ /* for each element */
+  memory<dfloat> J(Np);
+
+  for(int e=0;e<Nelements;++e){ /* for each element */
 
     for(int j=0;j<Nq;++j){
       for(int i=0;i<Nq;++i){
@@ -155,11 +174,8 @@ void meshQuad3D::SurfaceGeometricFactors(){
         ny /= sJ;
         nz /= sJ;
 
-        if(sJ<1e-8) {
-                stringstream ss;
-                ss << "Negative J found at element " << e << "\n";
-                LIBP_ABORT(ss.str())
-        }
+        LIBP_ABORT("Negative J found at element " << e,
+                   sJ<1e-8);
 
         int base = Nsgeo*(e*Nq*Nfaces + n + f*Nq);
 
@@ -170,96 +186,98 @@ void meshQuad3D::SurfaceGeometricFactors(){
 
         sgeo[base+IJID] = 1./Jid;
 
-        sgeo[base+WIJID] = 1./(Jid*w[0]);
-        sgeo[base+WSJID] = sJ*w[n];
-      }
-    }
-
-    // interpolate geofacs to surface quadrature
-    for(int f=0;f<Nfaces;++f){
-
-      for(int n=0;n<cubNq;++n){
-        dfloat cxr = 0, cxs = 0, cx = 0;
-        dfloat cyr = 0, cys = 0, cy = 0;
-        dfloat czr = 0, czs = 0, cz = 0;
-
-        for(int i=0;i<Nq;++i){
-          int id = faceNodes[i+f*Nq];
-          dfloat cIni = cubInterp[n*Nq+i];
-          cxr += cIni*xr[id];
-          cxs += cIni*xs[id];
-          cyr += cIni*yr[id];
-          cys += cIni*ys[id];
-          czr += cIni*zr[id];
-          czs += cIni*zs[id];
-          cx  += cIni*x[id+e*Np];
-          cy  += cIni*y[id+e*Np];
-          cz  += cIni*z[id+e*Np];
-        }
-
-        _cubx[e*cubNq*Nfaces+f*cubNq + n] = cx;
-        _cuby[e*cubNq*Nfaces+f*cubNq + n] = cy;
-        _cubz[e*cubNq*Nfaces+f*cubNq + n] = cz;
-
-        dfloat Gx = cyr*czs - czr*cys;
-        dfloat Gy = czr*cxs - cxr*czs;
-        dfloat Gz = cxr*cys - cyr*cxs;
-        // dfloat cJ = sqrt(Gx*Gx+Gy*Gy+Gz*Gz);
-        dfloat volJ = cx*Gx + cy*Gy + cz*Gz; // xij*tx + yij*ty + zij*tz;
-        dfloat nx=0.0, ny=0.0, nz=0.0;
-
-        if(f==0){
-          nx = cyr*cz - czr*cy;
-          ny = czr*cx - cxr*cz;
-          nz = cxr*cy - cyr*cx;
-        }
-
-        if(f==1){
-          nx = cys*cz - czs*cy;
-          ny = czs*cx - cxs*cz;
-          nz = cxs*cy - cys*cx;
-        }
-
-        if(f==2){
-          nx = -cyr*cz + czr*cy;
-          ny = -czr*cx + cxr*cz;
-          nz = -cxr*cy + cyr*cx;
-        }
-
-        if(f==3){
-          nx = -cys*cz + czs*cy;
-          ny = -czs*cx + cxs*cz;
-          nz = -cxs*cy + cys*cx;
-        }
-
-        dfloat R = sqrt(cx*cx+cy*cy+cz*cz);
-
-        nx /= R;
-        ny /= R;
-        nz /= R;
-
-        dfloat sJ = sqrt(nx*nx+ny*ny+nz*nz);
+        sgeo[base+WIJID] = 1./(Jid*gllw[0]);
+        sgeo[base+WSJID] = sJ*gllw[n];
 
-        nx /= sJ;
-        ny /= sJ;
-        nz /= sJ;
-
-        if(sJ<1e-8) {
-                stringstream ss;
-                ss << "Negative J found at element " << e << "\n";
-                LIBP_ABORT(ss.str())
-        }
-
-        int base = Nsgeo*(e*cubNq*Nfaces + n + f*cubNq);
-
-        cubsgeo[base+NXID] = nx;
-        cubsgeo[base+NYID] = ny;
-        cubsgeo[base+NZID] = nz;
-        cubsgeo[base+SJID] = sJ;
-        cubsgeo[base+IHID] = sJ/volJ;
-        //      cubsgeo[base+WSJID] = sJ*cubw[n];
+        hinv[e*Nq*Nfaces + n + f*Nq] = sJ/Jid;
       }
     }
+
+    // // interpolate geofacs to surface quadrature
+    // for(int f=0;f<Nfaces;++f){
+
+    //   for(int n=0;n<cubNq;++n){
+    //     dfloat cxr = 0, cxs = 0, cx = 0;
+    //     dfloat cyr = 0, cys = 0, cy = 0;
+    //     dfloat czr = 0, czs = 0, cz = 0;
+
+    //     for(int i=0;i<Nq;++i){
+    //       int id = faceNodes[i+f*Nq];
+    //       dfloat cIni = cubInterp[n*Nq+i];
+    //       cxr += cIni*xr[id];
+    //       cxs += cIni*xs[id];
+    //       cyr += cIni*yr[id];
+    //       cys += cIni*ys[id];
+    //       czr += cIni*zr[id];
+    //       czs += cIni*zs[id];
+    //       cx  += cIni*x[id+e*Np];
+    //       cy  += cIni*y[id+e*Np];
+    //       cz  += cIni*z[id+e*Np];
+    //     }
+
+    //     _cubx[e*cubNq*Nfaces+f*cubNq + n] = cx;
+    //     _cuby[e*cubNq*Nfaces+f*cubNq + n] = cy;
+    //     _cubz[e*cubNq*Nfaces+f*cubNq + n] = cz;
+
+    //     dfloat Gx = cyr*czs - czr*cys;
+    //     dfloat Gy = czr*cxs - cxr*czs;
+    //     dfloat Gz = cxr*cys - cyr*cxs;
+    //     // dfloat cJ = sqrt(Gx*Gx+Gy*Gy+Gz*Gz);
+    //     dfloat volJ = cx*Gx + cy*Gy + cz*Gz; // xij*tx + yij*ty + zij*tz;
+    //     dfloat nx=0.0, ny=0.0, nz=0.0;
+
+    //     if(f==0){
+    //       nx = cyr*cz - czr*cy;
+    //       ny = czr*cx - cxr*cz;
+    //       nz = cxr*cy - cyr*cx;
+    //     }
+
+    //     if(f==1){
+    //       nx = cys*cz - czs*cy;
+    //       ny = czs*cx - cxs*cz;
+    //       nz = cxs*cy - cys*cx;
+    //     }
+
+    //     if(f==2){
+    //       nx = -cyr*cz + czr*cy;
+    //       ny = -czr*cx + cxr*cz;
+    //       nz = -cxr*cy + cyr*cx;
+    //     }
+
+    //     if(f==3){
+    //       nx = -cys*cz + czs*cy;
+    //       ny = -czs*cx + cxs*cz;
+    //       nz = -cxs*cy + cys*cx;
+    //     }
+
+    //     dfloat R = sqrt(cx*cx+cy*cy+cz*cz);
+
+    //     nx /= R;
+    //     ny /= R;
+    //     nz /= R;
+
+    //     dfloat sJ = sqrt(nx*nx+ny*ny+nz*nz);
+
+    //     nx /= sJ;
+    //     ny /= sJ;
+    //     nz /= sJ;
+
+    //     if(sJ<1e-8) {
+    //             stringstream ss;
+    //             ss << "Negative J found at element " << e << "\n";
+    //             LIBP_ABORT(ss.str())
+    //     }
+
+    //     int base = Nsgeo*(e*cubNq*Nfaces + n + f*cubNq);
+
+    //     cubsgeo[base+NXID] = nx;
+    //     cubsgeo[base+NYID] = ny;
+    //     cubsgeo[base+NZID] = nz;
+    //     cubsgeo[base+SJID] = sJ;
+    //     cubsgeo[base+IHID] = sJ/volJ;
+    //     //      cubsgeo[base+WSJID] = sJ*cubw[n];
+    //   }
+    // }
   }
 
 
@@ -289,6 +307,8 @@ void meshQuad3D::SurfaceGeometricFactors(){
 #endif
   // TW: omit 1/min(h) calculation
 
+  halo.Exchange(hinv, Nfp*Nfaces);
+
   for(dlong e=0;e<Nelements;++e){ /* for each non-halo element */
     for(int n=0;n<Nfp*Nfaces;++n){
       dlong baseM = e*Nfp*Nfaces + n;
@@ -296,61 +316,59 @@ void meshQuad3D::SurfaceGeometricFactors(){
       if(baseP<0) baseP = baseM;
 
       // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness)
-      dfloat hinvM = sgeo[baseM*Nsgeo + SJID]*sgeo[baseM*Nsgeo + IJID];
-      dfloat hinvP = sgeo[baseP*Nsgeo + SJID]*sgeo[baseP*Nsgeo + IJID];
-
-      //      printf("hinvM/P = %g,%g\n", hinvM, hinvP);
-
-      sgeo[baseM*Nsgeo+IHID] = mymax(hinvM,hinvP);
-      sgeo[baseP*Nsgeo+IHID] = mymax(hinvM,hinvP);
-    }
-  }
-
-  for(dlong e=0;e<Nelements;++e){ /* for each non-halo element */
-    for(int f=0;f<Nfaces;++f){
-      dlong eP = EToE[e*Nfaces+f];
-      dlong fP = EToF[e*Nfaces+f];
-
-      // dfloat maxhinv  = 0;
-      for(int n=0;n<cubNq;++n){
-        dlong idM = e*cubNq*Nfaces+f*cubNq+n;
-        dfloat cxM = _cubx[idM];
-        dfloat cyM = _cuby[idM];
-        dfloat czM = _cubz[idM];
-
-        dfloat mindist2;
-        int minidP = 0;
-        // jump through hoops to find neighbor cubature node
-        // [ not needed elsewhere since we interpolate consistently ]
-        dlong idP;
-        for(int m=0;m<cubNq;++m){
-          idP = eP*cubNq*Nfaces+fP*cubNq+m;
-
-          dfloat cxP = _cubx[idP];
-          dfloat cyP = _cuby[idP];
-          dfloat czP = _cubz[idP];
-
-          dfloat dist2 = pow(cxP-cxM,2)+pow(cyP-cyM,2)+pow(czP-czM,2);
-
-          if(m==0 || dist2<mindist2){
-            mindist2 = dist2;
-            minidP = m;
-          }
-        }
-
-        if(mindist2>1e-12)
-        printf("mindist2 = %g\n", mindist2);
-
-        idM = Nsgeo*( e*cubNq*Nfaces+ f*cubNq+n)+IHID;
-        idP = Nsgeo*(eP*cubNq*Nfaces+fP*cubNq+minidP)+IHID;
-
-        dfloat hinv = mymax(cubsgeo[idM],cubsgeo[idP]);
-        cubsgeo[idM] = hinv;
-        cubsgeo[idP] = hinv;
-
-      }
+      dfloat hinvM = hinv[baseM];
+      dfloat hinvP = hinv[baseP];
+      sgeo[baseM*Nsgeo+IHID] = std::max(hinvM,hinvP);
     }
   }
 
-
+  // for(dlong e=0;e<Nelements;++e){ /* for each non-halo element */
+  //   for(int f=0;f<Nfaces;++f){
+  //     dlong eP = EToE[e*Nfaces+f];
+  //     dlong fP = EToF[e*Nfaces+f];
+
+  //     // dfloat maxhinv  = 0;
+  //     for(int n=0;n<cubNq;++n){
+  //       dlong idM = e*cubNq*Nfaces+f*cubNq+n;
+  //       dfloat cxM = _cubx[idM];
+  //       dfloat cyM = _cuby[idM];
+  //       dfloat czM = _cubz[idM];
+
+  //       dfloat mindist2;
+  //       int minidP = 0;
+  //       // jump through hoops to find neighbor cubature node
+  //       // [ not needed elsewhere since we interpolate consistently ]
+  //       dlong idP;
+  //       for(int m=0;m<cubNq;++m){
+  //         idP = eP*cubNq*Nfaces+fP*cubNq+m;
+
+  //         dfloat cxP = _cubx[idP];
+  //         dfloat cyP = _cuby[idP];
+  //         dfloat czP = _cubz[idP];
+
+  //         dfloat dist2 = pow(cxP-cxM,2)+pow(cyP-cyM,2)+pow(czP-czM,2);
+
+  //         if(m==0 || dist2<mindist2){
+  //           mindist2 = dist2;
+  //           minidP = m;
+  //         }
+  //       }
+
+  //       if(mindist2>1e-12)
+  //       printf("mindist2 = %g\n", mindist2);
+
+  //       idM = Nsgeo*( e*cubNq*Nfaces+ f*cubNq+n)+IHID;
+  //       idP = Nsgeo*(eP*cubNq*Nfaces+fP*cubNq+minidP)+IHID;
+
+  //       dfloat hinv = mymax(cubsgeo[idM],cubsgeo[idP]);
+  //       cubsgeo[idM] = hinv;
+  //       cubsgeo[idP] = hinv;
+
+  //     }
+  //   }
+  // }
+
+  o_sgeo = platform.malloc<dfloat>(sgeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSurfaceGeometricFactorsTet3D.cpp b/libs/mesh/meshSurfaceGeometricFactorsTet3D.cpp
index 727356ba9..02bbf7323 100644
--- a/libs/mesh/meshSurfaceGeometricFactorsTet3D.cpp
+++ b/libs/mesh/meshSurfaceGeometricFactorsTet3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,53 +25,34 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
 
-static void computeFrameTet3D(dfloat nx, dfloat ny, dfloat nz,
-		  dfloat &tanx, dfloat &tany, dfloat &tanz,
-		  dfloat &binx, dfloat &biny, dfloat &binz){
+namespace libp {
 
-  dfloat ranx = drand48();
-  dfloat rany = drand48();
-  dfloat ranz = drand48();
+void mesh_t::SurfaceGeometricFactorsTet3D(){
 
-  dfloat magran = sqrt(ranx*ranx+rany*rany+ranz*ranz);
-
-  ranx /= magran;
-  rany /= magran;
-  ranz /= magran;
-
-  tanx = ny*ranz - nz*rany;
-  tany = nz*ranx - nx*ranz;
-  tanz = nx*rany - ny*ranx;
-
-  dfloat magtan = sqrt(tanx*tanx+tany*tany+tanz*tanz);
-
-  tanx /= magtan;
-  tany /= magtan;
-  tanz /= magtan;
+  /* unified storage array for geometric factors */
+  Nsgeo = 6;
 
-  binx = ny*tanz - nz*tany;
-  biny = nz*tanx - nx*tanz;
-  binz = nx*tany - ny*tanx;
+  NXID  = 0;
+  NYID  = 1;
+  NZID  = 2;
+  SJID  = 3;
+  IJID  = 4;
+  IHID  = 5;
 
-  dfloat magbin = sqrt(binx*binx+biny*biny+binz*binz);
+  props["defines/" "p_Nsgeo"]= Nsgeo;
+  props["defines/" "p_NXID"]= NXID;
+  props["defines/" "p_NYID"]= NYID;
+  props["defines/" "p_NZID"]= NZID;
+  props["defines/" "p_SJID"]= SJID;
+  props["defines/" "p_IJID"]= IJID;
+  props["defines/" "p_IHID"]= IHID;
 
-  binx /= magbin;
-  biny /= magbin;
-  binz /= magbin;
+  sgeo.malloc(Nelements*Nsgeo*Nfaces);
 
-  //  printf("nor = %g,%g,%g; tan = %g,%g,%g; bin = %g,%g,%g\n", nx, ny, nz, tanx, tany, tanz, binx, biny, binz);
-}
+  memory<dfloat> hinv((Nelements+totalHaloPairs)*Nfaces);
 
-void meshTet3D::SurfaceGeometricFactors(){
-
-  /* unified storage array for geometric factors */
-  Nsgeo = 14;
-  sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*
-                            Nsgeo*Nfaces, sizeof(dfloat));
-
-  for(dlong e=0;e<Nelements+totalHaloPairs;++e){ /* for each element */
+  for(dlong e=0;e<Nelements;++e){ /* for each element */
 
     /* find vertex indices and physical coordinates */
     dlong id = e*Nverts;
@@ -91,11 +72,8 @@ void meshTet3D::SurfaceGeometricFactors(){
     dfloat sx = -(yr*zt - zr*yt)/J, sy =  (xr*zt - zr*xt)/J, sz = -(xr*yt - yr*xt)/J;
     dfloat tx =  (yr*zs - zr*ys)/J, ty = -(xr*zs - zr*xs)/J, tz =  (xr*ys - yr*xs)/J;
 
-    if(J<0) {
-      stringstream ss;
-      ss << "Negative J found at element " << e << "\n";
-      LIBP_ABORT(ss.str())
-    }
+    LIBP_ABORT("Negative J found at element " << e,
+               J<0);
 
     /* face 1 */
     dlong base = Nsgeo*Nfaces*e;
@@ -110,10 +88,7 @@ void meshTet3D::SurfaceGeometricFactors(){
     sgeo[base+SJID] = sJ1*J;
     sgeo[base+IJID] = 1./J;
 
-    // generate local tangent and binormal using random vector
-    computeFrameTet3D(nx1/sJ1, ny1/sJ1, nz1/sJ1,
-		 sgeo[base+STXID], sgeo[base+STYID], sgeo[base+STZID],
-		 sgeo[base+SBXID], sgeo[base+SBYID], sgeo[base+SBZID]);
+    hinv[Nfaces*e+0] = 0.5*sJ1;
 
     /* face 2 */
     base += Nsgeo;
@@ -128,10 +103,7 @@ void meshTet3D::SurfaceGeometricFactors(){
     sgeo[base+SJID] = sJ2*J;
     sgeo[base+IJID] = 1./J;
 
-    // generate local tangent and binormal using random vector
-    computeFrameTet3D(nx2/sJ2, ny2/sJ2, nz2/sJ2,
-		 sgeo[base+STXID], sgeo[base+STYID], sgeo[base+STZID],
-		 sgeo[base+SBXID], sgeo[base+SBYID], sgeo[base+SBZID]);
+    hinv[Nfaces*e+1] = 0.5*sJ2;
 
     /* face 3 */
     base += Nsgeo;
@@ -146,10 +118,7 @@ void meshTet3D::SurfaceGeometricFactors(){
     sgeo[base+SJID] = sJ3*J;
     sgeo[base+IJID] = 1./J;
 
-    // generate local tangent and binormal using random vector
-    computeFrameTet3D(nx3/sJ3, ny3/sJ3, nz3/sJ3,
-		 sgeo[base+STXID], sgeo[base+STYID], sgeo[base+STZID],
-		 sgeo[base+SBXID], sgeo[base+SBYID], sgeo[base+SBZID]);
+    hinv[Nfaces*e+2] = 0.5*sJ3;
 
     /* face 4 */
     base += Nsgeo;
@@ -164,10 +133,7 @@ void meshTet3D::SurfaceGeometricFactors(){
     sgeo[base+SJID] = sJ4*J;
     sgeo[base+IJID] = 1./J;
 
-    // generate local tangent and binormal using random vector
-    computeFrameTet3D(nx4/sJ4, ny4/sJ4, nz4/sJ4,
-		 sgeo[base+STXID], sgeo[base+STYID], sgeo[base+STZID],
-		 sgeo[base+SBXID], sgeo[base+SBYID], sgeo[base+SBZID]);
+    hinv[Nfaces*e+3] = 0.5*sJ4;
 
 #if 0
     printf("N1=(%g,%g,%g),sJ1=%g\n", nx1/sJ1,ny1/sJ1,nz1/sJ1,sJ1*J);
@@ -177,25 +143,32 @@ void meshTet3D::SurfaceGeometricFactors(){
 #endif
   }
 
-  for(dlong e=0;e<Nelements;++e){ /* for each non-halo element */
-    for(int f=0;f<Nfaces;++f){
-      dlong baseM = e*Nfaces + f;
+  halo.Exchange(hinv, Nfaces);
 
-      // awkward: (need to find eP,fP relative to bulk+halo)
-      dlong idP = vmapP[e*Nfp*Nfaces+f*Nfp+0];
-      dlong eP = (idP>=0) ? (idP/Np):e;
+  for(dlong eM=0;eM<Nelements;++eM){ /* for each non-halo element */
+    for(int fM=0;fM<Nfaces;++fM){
+      dlong eP = EToE[eM*Nfaces+fM];
 
-      int fP = EToF[baseM];
-      fP = (fP==-1) ? f:fP;
+      if (eP<0) eP = eM;
 
+      int fP = EToF[eM*Nfaces+fM];
+      if (fP<0) fP = fM;
+
+      dlong baseM = eM*Nfaces + fM;
       dlong baseP = eP*Nfaces + fP;
 
-      // rescaling,  V = A*h/3 => (J*4/3) = (sJ*2)*h/3 => h  = 2*J/sJ
-      dfloat hinvM = 0.5*sgeo[baseM*Nsgeo + SJID]*sgeo[baseM*Nsgeo + IJID];
-      dfloat hinvP = 0.5*sgeo[baseP*Nsgeo + SJID]*sgeo[baseP*Nsgeo + IJID];
+      // rescaling - V = A*h/3 => (J*4/3) = (sJ*2)*h/3 => h  = 2*J/sJ
+      dfloat hinvM = hinv[baseM];
+      dfloat hinvP = hinv[baseP];
+      sgeo[baseM*Nsgeo+IHID] = std::max(hinvM,hinvP);
 
-      sgeo[baseM*Nsgeo+IHID] = mymax(hinvM,hinvP);
-      sgeo[baseP*Nsgeo+IHID] = mymax(hinvM,hinvP);
+      // if (EToB[fM+eM*Nfaces] > 0) { //enforce a stronger penalty on boundaries
+      //   sgeo[baseM*Nsgeo+IHID] *= 2;
+      // }
     }
   }
+
+  o_sgeo = platform.malloc<dfloat>(sgeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSurfaceGeometricFactorsTri2D.cpp b/libs/mesh/meshSurfaceGeometricFactorsTri2D.cpp
index 94de0bccd..80d6dc430 100644
--- a/libs/mesh/meshSurfaceGeometricFactorsTri2D.cpp
+++ b/libs/mesh/meshSurfaceGeometricFactorsTri2D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,17 +25,32 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh2D.hpp"
 
-void meshTri2D::SurfaceGeometricFactors(){
+namespace libp {
+
+void mesh_t::SurfaceGeometricFactorsTri2D(){
 
   /* unified storage array for geometric factors */
-  Nsgeo = 6;
-  sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*
-				Nsgeo*Nfaces,
-				sizeof(dfloat));
+  Nsgeo = 5;
+
+  NXID  = 0;
+  NYID  = 1;
+  SJID  = 2;
+  IJID  = 3;
+  IHID  = 4;
+
+  props["defines/" "p_Nsgeo"]= Nsgeo;
+  props["defines/" "p_NXID"]= NXID;
+  props["defines/" "p_NYID"]= NYID;
+  props["defines/" "p_SJID"]= SJID;
+  props["defines/" "p_IJID"]= IJID;
+  props["defines/" "p_IHID"]= IHID;
+
+  sgeo.malloc(Nelements*Nsgeo*Nfaces);
 
-  for(dlong e=0;e<Nelements+totalHaloPairs;++e){ /* for each element */
+  memory<dfloat> hinv((Nelements+totalHaloPairs)*Nfaces);
+
+  for(dlong e=0;e<Nelements;++e){ /* for each element */
 
     /* find vertex indices and physical coordinates */
     dlong id = e*Nverts;
@@ -50,11 +65,9 @@ void meshTri2D::SurfaceGeometricFactors(){
 
     /* compute geometric factors for affine coordinate transform*/
     dfloat J = 0.25*((xe2-xe1)*(ye3-ye1) - (xe3-xe1)*(ye2-ye1));
-    if(J<0) {
-      stringstream ss;
-      ss << "Negative J found at element " << e << "\n";
-      LIBP_ABORT(ss.str())
-    }
+
+    LIBP_ABORT("Negative J found at element " << e,
+               J<0);
 
     /* face 1 */
     dlong base = Nsgeo*Nfaces*e;
@@ -67,6 +80,8 @@ void meshTri2D::SurfaceGeometricFactors(){
     sgeo[base+SJID] = d1/2.;
     sgeo[base+IJID] = 1./J;
 
+    hinv[Nfaces*e+0] = 0.25*d1/J;
+
     /* face 2 */
     base += Nsgeo;
     dfloat nx2 = ye3-ye2;
@@ -78,6 +93,8 @@ void meshTri2D::SurfaceGeometricFactors(){
     sgeo[base+SJID] = d2/2.; // TW fixed bug d1=>d2
     sgeo[base+IJID] = 1./J;
 
+    hinv[Nfaces*e+1] = 0.25*d2/J;
+
     /* face 3 */
     base += Nsgeo;
     dfloat nx3 = ye1-ye3;
@@ -88,46 +105,45 @@ void meshTri2D::SurfaceGeometricFactors(){
     sgeo[base+NYID] = ny3/d3;
     sgeo[base+SJID] = d3/2.;
     sgeo[base+IJID] = 1./J;
+
+    hinv[Nfaces*e+2] = 0.25*d3/J;
   }
 
+  halo.Exchange(hinv, Nfaces);
 
-  dfloat href = 0.;
-  dfloat tol  = 1.;
-  for(dlong e=0;e<Nelements;++e){ /* for each non-halo element */
-    for(int f=0;f<Nfaces;++f){
-      dlong baseM = e*Nfaces + f;
+  // dfloat href = 0.;
+  // dfloat tol  = 1.;
+  // for(dlong e=0;e<Nelements;++e){ /* for each non-halo element */
+  //   for(int f=0;f<Nfaces;++f){
+  //     dlong baseM = e*Nfaces + f;
 
-      // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness)  A = L*h/2 => (J*2) = (sJ*2)*h/2 => h  = 2*J/sJ
-      dfloat hinvM = sgeo[baseM*Nsgeo + SJID]*sgeo[baseM*Nsgeo + IJID];
+  //     // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness)  A = L*h/2 => (J*2) = (sJ*2)*h/2 => h  = 2*J/sJ
+  //     dfloat hinvM = sgeo[baseM*Nsgeo + SJID]*sgeo[baseM*Nsgeo + IJID];
 
-      href = mymax(hinvM,href);
-    }
-  }
+  //     href = mymax(hinvM,href);
+  //   }
+  // }
 
-  for(dlong e=0;e<Nelements;++e){ /* for each non-halo element */
-    for(int f=0;f<Nfaces;++f){
-      dlong baseM = e*Nfaces + f;
+  for(dlong eM=0;eM<Nelements;++eM){ /* for each non-halo element */
+    for(int fM=0;fM<Nfaces;++fM){
+      dlong eP = EToE[eM*Nfaces+fM];
 
-      // awkward: (need to find eP,fP relative to bulk+halo)
-      dlong idP = vmapP[e*Nfp*Nfaces+f*Nfp+0];
-      dlong eP = (idP>=0) ? (idP/Np):e;
+      if (eP<0) eP = eM;
 
-      int fP = EToF[baseM];
-      fP = (fP==-1) ? f:fP;
+      int fP = EToF[eM*Nfaces+fM];
+      if (fP<0) fP = fM;
 
+      dlong baseM = eM*Nfaces + fM;
       dlong baseP = eP*Nfaces + fP;
 
       // rescaling - A = L*h/2 => (J*2) = (sJ*2)*h/2 => h  = 2*J/sJ
-      dfloat hinvM = 0.5*sgeo[baseM*Nsgeo + SJID]*sgeo[baseM*Nsgeo + IJID];
-      dfloat hinvP = 0.5*sgeo[baseP*Nsgeo + SJID]*sgeo[baseP*Nsgeo + IJID];
-
-      sgeo[baseM*Nsgeo+IHID] = mymax(hinvM,hinvP);
-      sgeo[baseP*Nsgeo+IHID] = mymax(hinvM,hinvP);
+      dfloat hinvM = hinv[baseM];
+      dfloat hinvP = hinv[baseP];
+      sgeo[baseM*Nsgeo+IHID] = std::max(hinvM,hinvP);
 
-      if (EToB[f+e*Nfaces] > 0) { //enforce a stronger penalty on boundaries
-        sgeo[baseM*Nsgeo+IHID] = mymax(sgeo[baseM*Nsgeo+IHID],tol*href);
-        sgeo[baseP*Nsgeo+IHID] = mymax(sgeo[baseP*Nsgeo+IHID],tol*href);
-      }
+      // if (EToB[fM+eM*Nfaces] > 0) { //enforce a stronger penalty on boundaries
+      //   sgeo[baseM*Nsgeo+IHID] *= 2;
+      // }
 #if 0
       printf("e=%d f=%d (eP=%d,fP=%d) nx=%5.4f, ny=%5.4f, sJ=%5.4f, invJ=%5.4f, hinv=%f\n"
 	     ,e,f,eP,fP,
@@ -140,4 +156,7 @@ void meshTri2D::SurfaceGeometricFactors(){
     }
   }
 
+  o_sgeo = platform.malloc<dfloat>(sgeo);
 }
+
+} //namespace libp
diff --git a/libs/mesh/meshSurfaceGeometricFactorsTri3D.cpp b/libs/mesh/meshSurfaceGeometricFactorsTri3D.cpp
index cbbc5eeb0..c4e184ed2 100644
--- a/libs/mesh/meshSurfaceGeometricFactorsTri3D.cpp
+++ b/libs/mesh/meshSurfaceGeometricFactorsTri3D.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,26 +25,43 @@ SOFTWARE.
 */
 
 #include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+
+namespace libp {
 
 /* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */
-void meshTri3D::SurfaceGeometricFactors(){
+void mesh_t::SurfaceGeometricFactorsTri3D(){
 
   /* unified storage array for geometric factors */
-  Nsgeo = 14;
-  sgeo = (dfloat*) calloc((Nelements+totalHaloPairs)*
-				Nsgeo*Nfp*Nfaces,
-				sizeof(dfloat));
+  Nsgeo = 6;
+
+  NXID  = 0;
+  NYID  = 1;
+  NZID  = 2;
+  SJID  = 3;
+  IJID  = 4;
+  IHID  = 5;
+
+  props["defines/" "p_Nsgeo"]= Nsgeo;
+  props["defines/" "p_NXID"]= NXID;
+  props["defines/" "p_NYID"]= NYID;
+  props["defines/" "p_NZID"]= NZID;
+  props["defines/" "p_SJID"]= SJID;
+  props["defines/" "p_IJID"]= IJID;
+  props["defines/" "p_IHID"]= IHID;
+
+  sgeo.malloc(Nelements*Nsgeo*Nfp*Nfaces);
 
-  dfloat *xr = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *yr = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *zr = (dfloat*) calloc(Np, sizeof(dfloat));
+  memory<dfloat> hinv((Nelements+totalHaloPairs)*Nfp*Nfaces);
 
-  dfloat *xs = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *ys = (dfloat*) calloc(Np, sizeof(dfloat));
-  dfloat *zs = (dfloat*) calloc(Np, sizeof(dfloat));
+  memory<dfloat> xr(Np);
+  memory<dfloat> yr(Np);
+  memory<dfloat> zr(Np);
 
-  dfloat *J  = (dfloat*) calloc(Np, sizeof(dfloat));
+  memory<dfloat> xs(Np);
+  memory<dfloat> ys(Np);
+  memory<dfloat> zs(Np);
+
+  memory<dfloat> J (Np);
 
   for(int e=0;e<Nelements+totalHaloPairs;++e){ /* for each element */
 
@@ -59,16 +76,16 @@ void meshTri3D::SurfaceGeometricFactors(){
 
       for(int m=0;m<Np;++m){
 
-	dfloat Drn = Dr[n*Np+m];
-	dfloat Dsn = Ds[n*Np+m];
+        dfloat Drn = Dr[n*Np+m];
+        dfloat Dsn = Ds[n*Np+m];
 
-	xrn += Drn*x[m+e*Np];
-	yrn += Drn*y[m+e*Np];
-	zrn += Drn*z[m+e*Np];
+        xrn += Drn*x[m+e*Np];
+        yrn += Drn*y[m+e*Np];
+        zrn += Drn*z[m+e*Np];
 
-	xsn += Dsn*x[m+e*Np];
-	ysn += Dsn*y[m+e*Np];
-	zsn += Dsn*z[m+e*Np];
+        xsn += Dsn*x[m+e*Np];
+        ysn += Dsn*y[m+e*Np];
+        zsn += Dsn*z[m+e*Np];
 
       }
 
@@ -93,82 +110,83 @@ void meshTri3D::SurfaceGeometricFactors(){
 
     for(int f=0;f<Nfaces;++f){
       for(int n=0;n<Nfp;++n){
-	int id = faceNodes[n+f*Nfp];
+        int id = faceNodes[n+f*Nfp];
+
+        dfloat xid = x[id+e*Np];
+        dfloat yid = y[id+e*Np];
+        dfloat zid = z[id+e*Np];
+        dfloat Jid = J[id];
 
-	dfloat xid = x[id+e*Np];
-	dfloat yid = y[id+e*Np];
-	dfloat zid = z[id+e*Np];
-	dfloat Jid = J[id];
+        dfloat nx=0.0, ny=0.0, nz=0.0;
 
-	dfloat nx=0.0, ny=0.0, nz=0.0;
+        if(f==0){
+          nx = yr[id]*zid - zr[id]*yid;
+          ny = zr[id]*xid - xr[id]*zid;
+          nz = xr[id]*yid - yr[id]*xid;
+        }
 
-	if(f==0){
-	  nx = yr[id]*zid - zr[id]*yid;
-	  ny = zr[id]*xid - xr[id]*zid;
-	  nz = xr[id]*yid - yr[id]*xid;
-	}
+        if(f==1){
+          nx = (ys[id]-yr[id])*zid - (zs[id]-zr[id])*yid;
+          ny = (zs[id]-zr[id])*xid - (xs[id]-xr[id])*zid;
+          nz = (xs[id]-xr[id])*yid - (ys[id]-yr[id])*xid;
+        }
 
-	if(f==1){
-	  nx = (ys[id]-yr[id])*zid - (zs[id]-zr[id])*yid;
-	  ny = (zs[id]-zr[id])*xid - (xs[id]-xr[id])*zid;
-	  nz = (xs[id]-xr[id])*yid - (ys[id]-yr[id])*xid;
-	}
+        if(f==2){
+          nx = -ys[id]*zid + zs[id]*yid;
+          ny = -zs[id]*xid + xs[id]*zid;
+          nz = -xs[id]*yid + ys[id]*xid;
+        }
 
-	if(f==2){
-	  nx = -ys[id]*zid + zs[id]*yid;
-	  ny = -zs[id]*xid + xs[id]*zid;
-	  nz = -xs[id]*yid + ys[id]*xid;
-	}
+        dfloat R = sqrt(xid*xid+yid*yid+zid*zid);
 
-	dfloat R = sqrt(xid*xid+yid*yid+zid*zid);
+        nx /= R;
+        ny /= R;
+        nz /= R;
 
-	nx /= R;
-	ny /= R;
-	nz /= R;
+        dfloat sJ = sqrt(nx*nx+ny*ny+nz*nz);
 
-	dfloat sJ = sqrt(nx*nx+ny*ny+nz*nz);
+        nx /= sJ;
+        ny /= sJ;
+        nz /= sJ;
 
-	nx /= sJ;
-	ny /= sJ;
-	nz /= sJ;
+        LIBP_ABORT("Negative J found at element " << e,
+                   sJ<1e-8);
 
-	if(sJ<1e-8) {
-		stringstream ss;
-		ss << "Negative J found at element " << e << "\n";
-		LIBP_ABORT(ss.str())
-	}
+        int base = e*Nfp*Nfaces*Nsgeo + n + f*Nfp;
 
-	int base = e*Nfp*Nfaces*Nsgeo + n + f*Nfp;
+        sgeo[base+Nfp*Nfaces*NXID] = nx;
+        sgeo[base+Nfp*Nfaces*NYID] = ny;
+        sgeo[base+Nfp*Nfaces*NZID] = nz;
+        sgeo[base+Nfp*Nfaces*SJID] = sJ;
 
-	sgeo[base+Nfp*Nfaces*NXID] = nx;
-	sgeo[base+Nfp*Nfaces*NYID] = ny;
-	sgeo[base+Nfp*Nfaces*NZID] = nz;
-	sgeo[base+Nfp*Nfaces*SJID] = sJ;
+        sgeo[base+Nfp*Nfaces*IJID] = 1./Jid;
 
-	sgeo[base+Nfp*Nfaces*IJID] = 1./Jid;
+        hinv[e*Nfp*Nfaces + n + f*Nfp] = 0.5*sJ/Jid;
       }
     }
   }
 
+  o_sgeo = platform.malloc<dfloat>(sgeo);
+
 #if 0
   for(int e=0;e<Nelements;++e){
     for(int f=0;f<Nfaces;++f){
       for(int n=0;n<Nfp;++n){
-	int idM = n+f*Nfp+e*Nfaces*Nfp;
-	int idP = mapP[idM];
-	int eP = idP/(Nfp*Nfaces);
-	int fP = (idP%(Nfp*Nfaces))/Nfp;
-	int nP = (idP%Nfp);
-	int baseM = e*Nfp*Nfaces*Nsgeo + f*Nfp + n;
-	int baseP = eP*Nfp*Nfaces*Nsgeo + fP*Nfp + nP;
-	printf("e,f,n=(%d,%d,%d)-(%d,%d,%d): xP-xM=(%g,%g,%g) : norP+norM=%g,%g,%g\n",
-	       e,f,n,eP,fP,nP,
-	       x[vmapP[idM]]-x[vmapM[idM]],
-	       y[vmapP[idM]]-y[vmapM[idM]],
-	       z[vmapP[idM]]-z[vmapM[idM]],
-	       sgeo[baseM+NXID*Nfp*Nfaces]+sgeo[baseP+NXID*Nfp*Nfaces],
-	       sgeo[baseM+NYID*Nfp*Nfaces]+sgeo[baseP+NYID*Nfp*Nfaces],
-	       sgeo[baseM+NZID*Nfp*Nfaces]+sgeo[baseP+NZID*Nfp*Nfaces]);
+        int idM = n+f*Nfp+e*Nfaces*Nfp;
+        int idP = mapP[idM];
+        int eP = idP/(Nfp*Nfaces);
+        int fP = (idP%(Nfp*Nfaces))/Nfp;
+        int nP = (idP%Nfp);
+        int baseM = e*Nfp*Nfaces*Nsgeo + f*Nfp + n;
+        int baseP = eP*Nfp*Nfaces*Nsgeo + fP*Nfp + nP;
+        printf("e,f,n=(%d,%d,%d)-(%d,%d,%d): xP-xM=(%g,%g,%g) : norP+norM=%g,%g,%g\n",
+               e,f,n,eP,fP,nP,
+               x[vmapP[idM]]-x[vmapM[idM]],
+               y[vmapP[idM]]-y[vmapM[idM]],
+               z[vmapP[idM]]-z[vmapM[idM]],
+               sgeo[baseM+NXID*Nfp*Nfaces]+sgeo[baseP+NXID*Nfp*Nfaces],
+               sgeo[baseM+NYID*Nfp*Nfaces]+sgeo[baseP+NYID*Nfp*Nfaces],
+               sgeo[baseM+NZID*Nfp*Nfaces]+sgeo[baseP+NZID*Nfp*Nfaces]);
 
       }
     }
@@ -176,3 +194,5 @@ void meshTri3D::SurfaceGeometricFactors(){
 #endif
   // TW: omit 1/min(h) calculation
 }
+
+} //namespace libp
diff --git a/libs/mesh/okl/MassMatrixOperatorHex3D.okl b/libs/mesh/okl/MassMatrixOperatorHex3D.okl
index e64a4c83a..ea57bdd29 100644
--- a/libs/mesh/okl/MassMatrixOperatorHex3D.okl
+++ b/libs/mesh/okl/MassMatrixOperatorHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +26,7 @@ SOFTWARE.
 
 //spectral mass matrix
 @kernel void MassMatrixOperatorHex3D(const dlong Nelements,
-                                     @restrict const dfloat* ggeo,
+                                     @restrict const dfloat* wJ,
                                      @restrict const dfloat* MM,
                                      @restrict const dfloat* q,
                                      @restrict       dfloat* Mq){
@@ -36,10 +36,9 @@ SOFTWARE.
     for(int n=0;n<p_Np;++n;@inner(0)){
 
       const dlong qbase = e*p_Np*p_Nfields + n;
-      const dlong gbase = e*p_Nggeo*p_Np + n;
+      const dlong gbase = e*p_Np + n;
 
-      // assumes w*J built into G entries
-      const dfloat r_GwJ = ggeo[gbase+p_GWJID*p_Np];
+      const dfloat r_GwJ = wJ[gbase];
 
       #pragma unroll p_Nfields
       for (int f=0;f<p_Nfields;f++) {
diff --git a/libs/mesh/okl/MassMatrixOperatorQuad2D.okl b/libs/mesh/okl/MassMatrixOperatorQuad2D.okl
index 6bd1fbbbc..aad015863 100644
--- a/libs/mesh/okl/MassMatrixOperatorQuad2D.okl
+++ b/libs/mesh/okl/MassMatrixOperatorQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +26,7 @@ SOFTWARE.
 
 //spectral mass matrix
 @kernel void MassMatrixOperatorQuad2D(const dlong Nelements,
-                                      @restrict const dfloat* ggeo,
+                                      @restrict const dfloat* wJ,
                                       @restrict const dfloat* MM,
                                       @restrict const dfloat* q,
                                       @restrict       dfloat* Mq){
@@ -36,10 +36,9 @@ SOFTWARE.
     for(int n=0;n<p_Np;++n;@inner(0)){
 
       const dlong qbase = e*p_Np*p_Nfields + n;
-      const dlong gbase = e*p_Nggeo*p_Np + n;
+      const dlong gbase = e*p_Np + n;
 
-      // assumes w*J built into G entries
-      const dfloat r_GwJ = ggeo[gbase+p_GWJID*p_Np];
+      const dfloat r_GwJ = wJ[gbase];
 
       #pragma unroll p_Nfields
       for (int f=0;f<p_Nfields;f++) {
diff --git a/libs/mesh/okl/MassMatrixOperatorTet3D.okl b/libs/mesh/okl/MassMatrixOperatorTet3D.okl
index 69da2f4e4..628aa90de 100644
--- a/libs/mesh/okl/MassMatrixOperatorTet3D.okl
+++ b/libs/mesh/okl/MassMatrixOperatorTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +26,7 @@ SOFTWARE.
 
 
 @kernel void MassMatrixOperatorTet3D(const dlong Nelements,
-                                     @restrict const dfloat* ggeo,
+                                     @restrict const dfloat* wJ,
                                      @restrict const dfloat* MM,
                                      @restrict const dfloat* q,
                                      @restrict       dfloat* Mq){
@@ -44,11 +44,8 @@ SOFTWARE.
         s_q[f][n] = q[qbase+f*p_Np];
     }
 
-    @barrier("local");
-
     for(int n=0;n<p_Np;++n;@inner(0)){
-      const dlong gid = e*p_Nggeo;
-      const dfloat J = ggeo[gid + p_GWJID];
+      const dfloat J = wJ[e];
 
       dfloat r_qM[p_Nfields];
 
diff --git a/libs/mesh/okl/MassMatrixOperatorTri2D.okl b/libs/mesh/okl/MassMatrixOperatorTri2D.okl
index 39bb6ab6f..595c7cd5d 100644
--- a/libs/mesh/okl/MassMatrixOperatorTri2D.okl
+++ b/libs/mesh/okl/MassMatrixOperatorTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +26,7 @@ SOFTWARE.
 
 
 @kernel void MassMatrixOperatorTri2D(const dlong Nelements,
-                                     @restrict const dfloat* ggeo,
+                                     @restrict const dfloat* wJ,
                                      @restrict const dfloat* MM,
                                      @restrict const dfloat* q,
                                      @restrict       dfloat* Mq){
@@ -44,11 +44,8 @@ SOFTWARE.
         s_q[f][n] = q[qbase+f*p_Np];
     }
 
-    @barrier("local");
-
     for(int n=0;n<p_Np;++n;@inner(0)){
-      const dlong gid = e*p_Nggeo;
-      const dfloat J = ggeo[gid + p_GWJID];
+      const dfloat J = wJ[e];
 
       dfloat r_qM[p_Nfields];
 
diff --git a/libs/ogs/gs.cpp b/libs/ogs/gs.cpp
deleted file mode 100644
index ee6ce75a6..000000000
--- a/libs/ogs/gs.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "ogs.hpp"
-#include "ogs/ogsKernels.hpp"
-
-extern "C" {
-#include "gslib.h"
-}
-
-namespace ogs {
-
-OGS_DEFINE_TYPE_SIZES()
-OGS_GS_DEFINE_TYPE_MAP()
-OGS_GS_DEFINE_OP_MAP()
-
-// MPI based gather scatter using libgs
-void gsGatherScatter(void* v,
-                     const dlong Nentries,
-                     const dlong Nvectors,
-                     const dlong stride,
-                     const ogs_type type,
-                     const ogs_op op,
-                     const ogs_transpose trans,
-                     void *gsh){
-
-  const gs_op  gsop  = ogs_gs_op_map[op];
-  const gs_dom gsdom = ogs_gs_type_map[type];
-  const int gstrans  = (trans == ogs_notrans) ? 0 : 1;
-  //call libgs (symmetric behaviour)
-  if (Nentries==1 && Nvectors==1)
-    gs(v, gsdom, gsop, gstrans, (gs_data*)gsh, 0);
-  else if (Nvectors==1)
-    gs_vec(v, Nentries, gsdom, gsop, gstrans, (gs_data*)gsh, 0);
-  else if (Nentries==1) {
-    const size_t Nbytes = ogs_type_size[type];
-    void* V[Nvectors];
-    for (int i=0;i<Nvectors;i++)
-      V[i] = (char*)v + i*stride*Nbytes;
-
-    gs_many(V, Nvectors, gsdom, gsop, gstrans, (gs_data*)gsh, 0);
-  }
-}
-
-//Setup a gslib struct
-void *gsSetup(MPI_Comm meshComm,
-              dlong NuniqueBases,
-              hlong *gatherGlobalNodes,
-              int unique, int verbose){
-
-  /* gslib stuff */
-  comm_ext world;
-  struct comm com;
-
-  /*  MPI_Comm_dup(MPI_COMM_WORLD, (MPI_Comm*) &world); */
-  world = (comm_ext)meshComm; // MPI_COMM_WORLD;
-
-  comm_init(&com, world);
-
-  /* for the moment borrow gslib array */
-  slong *id = tmalloc(slong, NuniqueBases);
-
-  dlong n;
-  for(n=0;n<NuniqueBases;++n){ /* at some point need to choose int */
-    id[n] = (slong) gatherGlobalNodes[n];
-  }
-
-  struct gs_data *gsh = gs_setup(id, NuniqueBases, &com, unique, gs_auto, verbose); // gs_auto, gs_crystal_router, gs_pw
-
-  free(id);
-
-  return gsh;
-}
-
-void gsUnique(hlong *gatherGlobalNodes,
-              dlong NuniqueBases,
-              MPI_Comm meshComm){
-
-  /* gslib stuff */
-  comm_ext world;
-  struct comm com;
-
-  /*  MPI_Comm_dup(MPI_COMM_WORLD, (MPI_Comm*) &world); */
-  world = (comm_ext)meshComm; // MPI_COMM_WORLD;
-
-  comm_init(&com, world);
-
-  /* for the moment borrow gslib array */
-  slong *id = tmalloc(slong, NuniqueBases);
-
-  dlong n;
-  for(n=0;n<NuniqueBases;++n){ /* at some point need to choose int */
-    id[n] = (slong) gatherGlobalNodes[n];
-  }
-
-  gs_unique(id, NuniqueBases, &com);
-
-  for(n=0;n<NuniqueBases;++n){ /* at some point need to choose int */
-    gatherGlobalNodes[n] = (hlong) id[n];
-  }
-
-  free(id);
-}
-
-void gsFree(void* gs) {
-  gs_free((gs_data*)gs);
-}
-
-} //namespace ogs
\ No newline at end of file
diff --git a/libs/ogs/hostGather.cpp b/libs/ogs/hostGather.cpp
deleted file mode 100644
index 813adda04..000000000
--- a/libs/ogs/hostGather.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "ogs.hpp"
-#include "ogs/ogsKernels.hpp"
-
-namespace ogs {
-
-OGS_DEFINE_TYPE_SIZES()
-OGS_FOR_EACH_TYPE(DEFINE_ADD_OGS_INIT)
-
-void hostGather(void* gv,
-                void* v,
-                const int Nentries,
-                const int Nvectors,
-                const dlong gstride,
-                const dlong stride,
-                const ogs_type type,
-                const ogs_op op,
-                const ogs_transpose trans,
-                ogs_t &ogs){
-
-  const size_t Nbytes = ogs_type_size[type];
-
-  if (trans == ogs_sym)
-    LIBP_ABORT(string("Calling ogs::Gather in ogs_sym mode not supported."))
-
-  ogs.reallocHostBuffer(Nbytes*Nentries*Nvectors);
-
-  dlong NhaloGather = (trans == ogs_notrans) ? ogs.haloGather.Nrows : ogs.haloScatter.Nrows;
-
-  // gather halo nodes
-  if (NhaloGather) {
-    if (trans == ogs_notrans)
-      hostGatherKernel(ogs.haloGather.Nrows, Nentries, Nvectors, stride, ogs.Nhalo,
-                       ogs.haloGather.rowStarts, ogs.haloGather.colIds,
-                       type, op, v, ogs.hostBuf);
-    else
-      hostGatherKernel(ogs.haloScatter.Nrows, Nentries, Nvectors, stride, ogs.Nhalo,
-                       ogs.haloScatter.rowStarts, ogs.haloScatter.colIds,
-                       type, op, v, ogs.hostBuf);
-  }
-
-  // MPI based gather using libgs
-  gsGatherScatter(ogs.hostBuf, Nentries, Nvectors, ogs.Nhalo,
-                  type, op, trans, ogs.gsh);
-
-  if (ogs.haloGather.Nrows)
-    for (int i=0;i<Nvectors;i++)
-      memcpy((char*)gv+ogs.localGather.Nrows*Nbytes*Nentries + gstride*Nbytes*i,
-             (char*)ogs.hostBuf+ogs.Nhalo*Nbytes*Nentries*i,
-             ogs.haloGather.Nrows*Nentries*Nbytes);
-
-  // gather interior nodes
-  if (ogs.Nlocal) {
-    if (trans == ogs_notrans)
-      hostGatherKernel(ogs.localGather.Nrows, Nentries, Nvectors, stride, gstride,
-                       ogs.localGather.rowStarts, ogs.localGather.colIds,
-                       type, op, v, gv);
-    else
-      hostGatherKernel(ogs.localScatter.Nrows, Nentries, Nvectors, stride, gstride,
-                       ogs.localScatter.rowStarts, ogs.localScatter.colIds,
-                       type, op, v, gv);
-  }
-}
-
-/*------------------------------------------------------------------------------
-  The basic gather kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_GATHER(T,OP)                                                     \
-static void hostGatherKernel_##T##_##OP(const dlong N,                          \
-                                        const int   Nentries,                   \
-                                        const int   Nvectors,                   \
-                                        const dlong stride,                     \
-                                        const dlong gstride,                    \
-                                        const dlong *gatherStarts,              \
-                                        const dlong *gatherIds,                 \
-                                        const     T *q,                         \
-                                                  T *gatherq)                   \
-{                                                                               \
-  for(dlong n=0;n<N*Nentries*Nvectors;++n){                                     \
-    const int m     = n/(N*Nentries);                                           \
-    const dlong vid = n%(N*Nentries);                                           \
-    const dlong gid = vid/Nentries;                                             \
-    const int k     = vid%Nentries;                                             \
-    const dlong start = gatherStarts[gid];                                      \
-    const dlong end = gatherStarts[gid+1];                                      \
-    T gq = init_##T##_##OP;                                                     \
-    for(dlong g=start;g<end;++g){                                               \
-      const dlong id = gatherIds[g];                                            \
-      OGS_DO_##OP(gq,q[k+id*Nentries+m*stride]);                                \
-    }                                                                           \
-    gatherq[k+gid*Nentries+m*gstride] = gq;                                     \
-  }                                                                             \
-}
-
-#define DEFINE_PROCS(T) \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHER)
-
-OGS_FOR_EACH_TYPE(DEFINE_PROCS)
-
-#define SWITCH_TYPE_CASE(T) case ogs_##T: { WITH_TYPE(T); break; }
-#define SWITCH_TYPE(type) switch(type) { \
-    OGS_FOR_EACH_TYPE(SWITCH_TYPE_CASE) case ogs_type_n: break; }
-
-#define SWITCH_OP_CASE(T,OP) case ogs_##OP: { WITH_OP(T,OP); break; }
-#define SWITCH_OP(T,op) switch(op) { \
-    OGS_FOR_EACH_OP(T,SWITCH_OP_CASE) case ogs_op_n: break; }
-
-
-void hostGatherKernel(const dlong N,
-                      const int Nentries,
-                      const int Nvectors,
-                      const dlong stride,
-                      const dlong gstride,
-                      const dlong *gatherStarts,
-                      const dlong *gatherIds,
-                      const ogs_type type,
-                      const ogs_op op,
-                      const void *v,
-                      void *gv) {
-
-#define WITH_OP(T,OP)                         \
-  hostGatherKernel_##T##_##OP(N,              \
-                              Nentries,       \
-                              Nvectors,       \
-                              stride,         \
-                              gstride,        \
-                              gatherStarts,   \
-                              gatherIds,      \
-                              (T*) v,         \
-                              (T*) gv);
-#define WITH_TYPE(T) SWITCH_OP(T,op)
-
-  SWITCH_TYPE(type)
-
-#undef  WITH_TYPE
-#undef  WITH_OP
-}
-
-} //namespace ogs
\ No newline at end of file
diff --git a/libs/ogs/hostGatherScatter.cpp b/libs/ogs/hostGatherScatter.cpp
deleted file mode 100644
index fb76885da..000000000
--- a/libs/ogs/hostGatherScatter.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "ogs.hpp"
-#include "ogs/ogsKernels.hpp"
-
-namespace ogs {
-
-OGS_DEFINE_TYPE_SIZES()
-OGS_FOR_EACH_TYPE(DEFINE_ADD_OGS_INIT)
-
-void hostGatherScatter(void* v,
-                       const int Nentries,
-                       const int Nvectors,
-                       const dlong stride,
-                       const ogs_type type,
-                       const ogs_op op,
-                       const ogs_transpose trans,
-                       ogs_t &ogs){
-
-  const size_t Nbytes = ogs_type_size[type];
-
-  ogs.reallocHostBuffer(Nbytes*Nentries*Nvectors);
-
-  dlong NhaloGather  = (trans == ogs_notrans) ? ogs.haloGather.Nrows : ogs.haloScatter.Nrows;
-  dlong NhaloScatter = (trans == ogs_trans)   ? ogs.haloGather.Nrows : ogs.haloScatter.Nrows;
-
-  void* gsh = (trans == ogs_sym) ? ogs.gshSym : ogs.gsh;
-
-  // gather-scatter halo nodes
-  if (NhaloGather) {
-    if (trans == ogs_notrans)
-      hostGatherKernel(ogs.haloGather.Nrows, Nentries, Nvectors, stride, ogs.Nhalo,
-                       ogs.haloGather.rowStarts, ogs.haloGather.colIds,
-                       type, op, v, ogs.hostBuf);
-    else
-      hostGatherKernel(ogs.haloScatter.Nrows, Nentries, Nvectors, stride, ogs.Nhalo,
-                       ogs.haloScatter.rowStarts, ogs.haloScatter.colIds,
-                       type, op, v, ogs.hostBuf);
-  }
-
-  // MPI based gather scatter using libgs
-  gsGatherScatter(ogs.hostBuf, Nentries, Nvectors, ogs.Nhalo,
-                  type, op, trans, gsh);
-
-  if (NhaloScatter) {
-    if (trans == ogs_trans)
-      hostScatterKernel(ogs.haloGather.Nrows, Nentries, Nvectors, stride, ogs.Nhalo,
-                        ogs.haloGather.rowStarts, ogs.haloGather.colIds,
-                        type, op, ogs.hostBuf, v);
-    else
-      hostScatterKernel(ogs.haloScatter.Nrows, Nentries, Nvectors, stride, ogs.Nhalo,
-                        ogs.haloScatter.rowStarts, ogs.haloScatter.colIds,
-                        type, op, ogs.hostBuf, v);
-  }
-
-  if (trans == ogs_notrans) {
-    if (ogs.fusedScatter.Nrows)
-      hostGatherScatterKernel(ogs.fusedScatter.Nrows, Nentries, Nvectors, stride,
-                              ogs.fusedGather.rowStarts,  ogs.fusedGather.colIds,
-                              ogs.fusedScatter.rowStarts, ogs.fusedScatter.colIds,
-                              type, op, v);
-  } else if (trans == ogs_trans) {
-    if (ogs.fusedScatter.Nrows)
-      hostGatherScatterKernel(ogs.fusedScatter.Nrows, Nentries, Nvectors, stride,
-                              ogs.fusedScatter.rowStarts, ogs.fusedScatter.colIds,
-                              ogs.fusedGather.rowStarts,  ogs.fusedGather.colIds,
-                              type, op, v);
-  } else { //ogs_sym
-    if (ogs.symGatherScatter.Nrows)
-      hostGatherScatterKernel(ogs.symGatherScatter.Nrows, Nentries, Nvectors, stride,
-                              ogs.symGatherScatter.rowStarts, ogs.symGatherScatter.colIds,
-                              ogs.symGatherScatter.rowStarts, ogs.symGatherScatter.colIds,
-                              type, op, v);
-  }
-}
-
-/*------------------------------------------------------------------------------
-  The basic gatherScatter kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_GATHERSCATTER(T,OP)                                              \
-static void hostGatherScatterKernel_##T##_##OP(const dlong N,                   \
-                                               const int   Nentries,            \
-                                               const int   Nvectors,            \
-                                               const dlong stride,              \
-                                               const dlong *gatherStarts,       \
-                                               const dlong *gatherIds,          \
-                                               const dlong *scatterStarts,      \
-                                               const dlong *scatterIds,         \
-                                                         T *q)                  \
-{                                                                               \
-  for(dlong n=0;n<N*Nentries*Nvectors;++n){                                     \
-    const int m     = n/(N*Nentries);                                           \
-    const dlong vid = n%(N*Nentries);                                           \
-    const dlong gid = vid/Nentries;                                             \
-    const int k     = vid%Nentries;                                             \
-    const dlong gstart = gatherStarts[gid];                                     \
-    const dlong gend = gatherStarts[gid+1];                                     \
-    T gq = init_##T##_##OP;                                                     \
-    for(dlong g=gstart;g<gend;++g){                                             \
-      const dlong id = gatherIds[g];                                            \
-      OGS_DO_##OP(gq,q[k+id*Nentries+m*stride]);                                \
-    }                                                                           \
-    const dlong sstart = scatterStarts[gid];                                    \
-    const dlong send = scatterStarts[gid+1];                                    \
-    for(dlong g=sstart;g<send;++g){                                             \
-      const dlong id = gatherIds[g];                                            \
-      q[k+id*Nentries+m*stride] = gq;                                           \
-    }                                                                           \
-  }                                                                             \
-}
-
-#define DEFINE_PROCS(T) \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHERSCATTER)
-
-OGS_FOR_EACH_TYPE(DEFINE_PROCS)
-
-#define SWITCH_TYPE_CASE(T) case ogs_##T: { WITH_TYPE(T); break; }
-#define SWITCH_TYPE(type) switch(type) { \
-    OGS_FOR_EACH_TYPE(SWITCH_TYPE_CASE) case ogs_type_n: break; }
-
-#define SWITCH_OP_CASE(T,OP) case ogs_##OP: { WITH_OP(T,OP); break; }
-#define SWITCH_OP(T,op) switch(op) { \
-    OGS_FOR_EACH_OP(T,SWITCH_OP_CASE) case ogs_op_n: break; }
-
-
-void hostGatherScatterKernel(const dlong N,
-                             const int Nentries,
-                             const int Nvectors,
-                             const dlong stride,
-                             dlong* gatherStarts,
-                             dlong* gatherIds,
-                             dlong* scatterStarts,
-                             dlong* scatterIds,
-                             const ogs_type type,
-                             const ogs_op op,
-                             void* v) {
-
-#define WITH_OP(T,OP)                                \
-  hostGatherScatterKernel_##T##_##OP(N,              \
-                                     Nentries,       \
-                                     Nvectors,       \
-                                     stride,         \
-                                     gatherStarts,   \
-                                     gatherIds,      \
-                                     scatterStarts,  \
-                                     scatterIds,     \
-                                     (T*)v);
-#define WITH_TYPE(T) SWITCH_OP(T,op)
-
-  SWITCH_TYPE(type)
-
-#undef  WITH_TYPE
-#undef  WITH_OP
-}
-
-} //namespace ogs
\ No newline at end of file
diff --git a/libs/ogs/hostScatter.cpp b/libs/ogs/hostScatter.cpp
deleted file mode 100644
index 82ee67f7f..000000000
--- a/libs/ogs/hostScatter.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "ogs.hpp"
-#include "ogs/ogsKernels.hpp"
-
-namespace ogs {
-
-OGS_DEFINE_TYPE_SIZES()
-
-void hostScatter(void* v,
-                 void* gv,
-                 const int Nentries,
-                 const int Nvectors,
-                 const dlong stride,
-                 const dlong gstride,
-                 const ogs_type type,
-                 const ogs_op op,
-                 const ogs_transpose trans,
-                 ogs_t &ogs){
-
-  const size_t Nbytes = ogs_type_size[type];
-
-  ogs.reallocHostBuffer(Nbytes*Nentries*Nvectors);
-
-  if (trans == ogs_sym)
-    LIBP_ABORT(string("Calling ogs::Scatter in ogs_sym mode not supported."))
-
-  if (ogs.haloGather.Nrows)
-    for (int i=0;i<Nvectors;i++)
-      memcpy((char*)ogs.hostBuf+ogs.Nhalo*Nbytes*Nentries*i,
-             (char*)gv+ogs.localGather.Nrows*Nbytes + gstride*Nbytes*i,
-             ogs.haloGather.Nrows*Nbytes*Nentries);
-
-  // MPI based scatter using gslib
-  // (must use ogs_notrans so the negative ids don't contribute to op)
-  gsGatherScatter(ogs.hostBuf, Nentries, Nvectors, ogs.Nhalo,
-                  type, op, ogs_notrans, ogs.gsh);
-
-  dlong NhaloScatter = (trans == ogs_trans) ? ogs.haloGather.Nrows : ogs.haloScatter.Nrows;
-
-  if (NhaloScatter) {
-    if (trans == ogs_trans)
-      hostScatterKernel(ogs.haloGather.Nrows, Nentries, Nvectors, ogs.Nhalo, stride,
-                        ogs.haloGather.rowStarts, ogs.haloGather.colIds,
-                        type, op, ogs.hostBuf, v);
-    else
-      hostScatterKernel(ogs.haloScatter.Nrows, Nentries, Nvectors, ogs.Nhalo, stride,
-                        ogs.haloScatter.rowStarts, ogs.haloScatter.colIds,
-                        type, op, ogs.hostBuf, v);
-  }
-
-  // scatter interior nodes
-  if (ogs.Nlocal) {
-    if (trans == ogs_trans)
-      hostScatterKernel(ogs.localGather.Nrows, Nentries, Nvectors, gstride, stride,
-                        ogs.localGather.rowStarts, ogs.localGather.colIds,
-                        type, op, gv, v);
-    else
-      hostScatterKernel(ogs.localScatter.Nrows, Nentries, Nvectors, gstride, stride,
-                        ogs.localScatter.rowStarts, ogs.localScatter.colIds,
-                        type, op, gv, v);
-  }
-}
-
-/*------------------------------------------------------------------------------
-  The basic gather kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_SCATTER(T)                                                       \
-static void hostScatterKernel_##T(const dlong N,                                \
-                                  const int   Nentries,                         \
-                                  const int   Nvectors,                         \
-                                  const dlong gstride,                          \
-                                  const dlong stride,                           \
-                                  const dlong *scatterStarts,                   \
-                                  const dlong *scatterIds,                      \
-                                  const     T *gatherq,                         \
-                                            T *q)                               \
-{                                                                               \
-  for(dlong n=0;n<N*Nentries*Nvectors;++n){                                     \
-    const int m     = n/(N*Nentries);                                           \
-    const dlong vid = n%(N*Nentries);                                           \
-    const dlong gid = vid/Nentries;                                             \
-    const int k     = vid%Nentries;                                             \
-    const T gq = gatherq[k+gid*Nentries+m*gstride];                             \
-    const dlong start = scatterStarts[gid];                                     \
-    const dlong end = scatterStarts[gid+1];                                     \
-    for(dlong g=start;g<end;++g){                                               \
-      const dlong id = scatterIds[g];                                           \
-      q[k+id*Nentries+m*stride] = gq;                                           \
-    }                                                                           \
-  }                                                                             \
-}
-
-#define DEFINE_PROCS(T) \
-  DEFINE_SCATTER(T)
-
-OGS_FOR_EACH_TYPE(DEFINE_PROCS)
-
-#define SWITCH_TYPE_CASE(T) case ogs_##T: { WITH_TYPE(T); break; }
-#define SWITCH_TYPE(type) switch(type) { \
-    OGS_FOR_EACH_TYPE(SWITCH_TYPE_CASE) case ogs_type_n: break; }
-
-void hostScatterKernel(const dlong N,
-                       const int Nentries,
-                       const int Nvectors,
-                       const dlong gstride,
-                       const dlong stride,
-                       const dlong *scatterStarts,
-                       const dlong *scatterIds,
-                       const ogs_type type,
-                       const ogs_op op,
-                       const void *gv,
-                       void *v) {
-
-#define WITH_TYPE(T)                          \
-  hostScatterKernel_##T(N,                    \
-                        Nentries,             \
-                        Nvectors,             \
-                        gstride,              \
-                        stride,               \
-                        scatterStarts,        \
-                        scatterIds,           \
-                        (T*) gv,              \
-                        (T*) v);
-
-  SWITCH_TYPE(type)
-
-#undef  WITH_TYPE
-}
-
-} //namespace ogs
\ No newline at end of file
diff --git a/libs/ogs/occaGather.cpp b/libs/ogs/occaGather.cpp
deleted file mode 100644
index 9c4851257..000000000
--- a/libs/ogs/occaGather.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "ogs.hpp"
-#include "ogs/ogsKernels.hpp"
-
-namespace ogs {
-
-OGS_DEFINE_TYPE_SIZES()
-
-void occaGatherStart(occa::memory& o_gv,
-                     occa::memory& o_v,
-                     const int Nentries,
-                     const int Nvectors,
-                     const dlong gstride,
-                     const dlong stride,
-                     const ogs_type type,
-                     const ogs_op op,
-                     const ogs_transpose trans,
-                     ogs_t &ogs){
-
-  occa::device &device = ogs.platform.device;
-  const size_t Nbytes = ogs_type_size[type];
-
-  if (trans == ogs_sym)
-    LIBP_ABORT(string("Calling ogs::Gather in ogs_sym mode not supported."))
-
-  ogs.reallocOccaBuffer(Nbytes*Nentries*Nvectors);
-
-  dlong NhaloGather = (trans == ogs_notrans) ? ogs.haloGather.Nrows : ogs.haloScatter.Nrows;
-
-  // gather halo nodes on device
-  if (NhaloGather) {
-    if (trans == ogs_notrans)
-      occaGatherKernel(ogs.haloGather, Nentries, Nvectors, stride, ogs.Nhalo,
-                       type, op, o_v, ogs.o_haloBuf);
-    else
-      occaGatherKernel(ogs.haloScatter, Nentries, Nvectors, stride, ogs.Nhalo,
-                       type, op, o_v, ogs.o_haloBuf);
-
-    device.finish();
-    occa::stream currentStream = device.getStream();
-    device.setStream(dataStream);
-
-    for (int i=0;i<Nvectors;i++)
-      ogs.o_haloBuf.copyTo((char*)ogs.haloBuf + ogs.Nhalo*Nbytes*Nentries*i,
-                           NhaloGather*Nbytes*Nentries,
-                           ogs.Nhalo*Nbytes*Nentries*i,
-                           "async: true");
-
-    device.setStream(currentStream);
-  }
-}
-
-
-void occaGatherFinish(occa::memory& o_gv,
-                      occa::memory& o_v,
-                      const int Nentries,
-                      const int Nvectors,
-                      const dlong gstride,
-                      const dlong stride,
-                      const ogs_type type,
-                      const ogs_op op,
-                      const ogs_transpose trans,
-                      ogs_t &ogs){
-
-  occa::device &device = ogs.platform.device;
-  const size_t Nbytes = ogs_type_size[type];
-
-  if (trans == ogs_sym)
-    LIBP_ABORT(string("Calling ogs::Gather in ogs_sym mode not supported."))
-
-  if(ogs.Nlocal) {
-    if (trans == ogs_notrans)
-      occaGatherKernel(ogs.localGather, Nentries, Nvectors, stride, gstride,
-                       type, op, o_v, o_gv);
-    else
-      occaGatherKernel(ogs.localScatter, Nentries, Nvectors, stride, gstride,
-                       type, op, o_v, o_gv);
-  }
-
-  occa::stream currentStream = device.getStream();
-  if (ogs.Nhalo) {
-    device.setStream(dataStream);
-    device.finish();
-    device.setStream(currentStream);
-  }
-
-  // MPI based gather using libgs
-  gsGatherScatter(ogs.haloBuf, Nentries, Nvectors, ogs.Nhalo,
-                  type, op, trans, ogs.gsh);
-
-  // copy totally gathered halo data back from HOST to DEVICE
-  if (ogs.haloGather.Nrows) {
-    device.setStream(dataStream);
-
-    for (int i=0;i<Nvectors;i++)
-      o_gv.copyFrom((char*)ogs.haloBuf+ogs.Nhalo*Nbytes*Nentries*i,
-                    ogs.haloGather.Nrows*Nbytes*Nentries,
-                    ogs.localGather.Nrows*Nbytes*Nentries + gstride*Nbytes*i,
-                    "async: true");
-
-    device.finish();
-    device.setStream(currentStream);
-  }
-}
-
-
-#define SWITCH_TYPE_CASE(T) case ogs_##T: { WITH_TYPE(T); break; }
-#define SWITCH_TYPE(type) switch(type) { \
-    OGS_FOR_EACH_TYPE(SWITCH_TYPE_CASE) case ogs_type_n: break; }
-
-#define SWITCH_OP_CASE(T,OP) case ogs_##OP: { WITH_OP(T,OP); break; }
-#define SWITCH_OP(T,op) switch(op) { \
-    OGS_FOR_EACH_OP(T,SWITCH_OP_CASE) case ogs_op_n: break; }
-
-
-void occaGatherKernel(const ogsData_t &gather,
-                      const int Nentries,
-                      const int Nvectors,
-                      const dlong stride,
-                      const dlong gstride,
-                      const ogs_type type,
-                      const ogs_op op,
-                      occa::memory& o_v,
-                      occa::memory& o_gv) {
-
-#define WITH_OP(T,OP)                              \
-  gatherKernel_##T##_##OP(gather.NrowBlocks,       \
-                          Nentries,                \
-                          Nvectors,                \
-                          stride,                  \
-                          gstride,                 \
-                          gather.o_blockRowStarts, \
-                          gather.o_rowStarts,      \
-                          gather.o_colIds,         \
-                          o_v,                     \
-                          o_gv);
-#define WITH_TYPE(T) SWITCH_OP(T,op)
-
-  SWITCH_TYPE(type)
-
-#undef  WITH_TYPE
-#undef  WITH_OP
-}
-
-} //namespace ogs
\ No newline at end of file
diff --git a/libs/ogs/occaGatherScatter.cpp b/libs/ogs/occaGatherScatter.cpp
deleted file mode 100644
index e90c4837c..000000000
--- a/libs/ogs/occaGatherScatter.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "ogs.hpp"
-#include "ogs/ogsKernels.hpp"
-
-namespace ogs {
-
-OGS_DEFINE_TYPE_SIZES()
-
-void occaGatherScatterStart(occa::memory& o_v,
-                            const int Nentries,
-                            const int Nvectors,
-                            const dlong stride,
-                            const ogs_type type,
-                            const ogs_op op,
-                            const ogs_transpose trans,
-                            ogs_t &ogs){
-
-  occa::device &device = ogs.platform.device;
-  const size_t Nbytes = ogs_type_size[type];
-
-  ogs.reallocOccaBuffer(Nbytes*Nentries*Nvectors);
-
-  dlong NhaloGather = (trans == ogs_notrans) ? ogs.haloGather.Nrows : ogs.haloScatter.Nrows;
-
-  // gather halo nodes on device
-  if (NhaloGather) {
-    if (trans == ogs_notrans)
-      occaGatherKernel(ogs.haloGather, Nentries, Nvectors, stride, ogs.Nhalo,
-                       type, op, o_v, ogs.o_haloBuf);
-    else
-      occaGatherKernel(ogs.haloScatter, Nentries, Nvectors, stride, ogs.Nhalo,
-                       type, op, o_v, ogs.o_haloBuf);
-
-    device.finish();
-    occa::stream currentStream = device.getStream();
-    device.setStream(dataStream);
-
-    for (int i=0;i<Nvectors;i++)
-      ogs.o_haloBuf.copyTo((char*)ogs.haloBuf + ogs.Nhalo*Nbytes*Nentries*i,
-                           NhaloGather*Nbytes*Nentries,
-                           ogs.Nhalo*Nbytes*Nentries*i,
-                           "async: true");
-
-    device.setStream(currentStream);
-  }
-}
-
-
-void occaGatherScatterFinish(occa::memory& o_v,
-                             const int Nentries,
-                             const int Nvectors,
-                             const dlong stride,
-                             const ogs_type type,
-                             const ogs_op op,
-                             const ogs_transpose trans,
-                             ogs_t &ogs){
-
-  occa::device &device = ogs.platform.device;
-  const size_t Nbytes = ogs_type_size[type];
-
-  void* gsh = (trans == ogs_sym) ? ogs.gshSym : ogs.gsh;
-
-  if (trans == ogs_notrans) {
-    if(ogs.fusedScatter.Nrows)
-      occaGatherScatterKernel(ogs.fusedGather, ogs.fusedScatter,
-                              Nentries, Nvectors, stride, type, op, o_v);
-  } else if (trans == ogs_trans) {
-    if(ogs.fusedScatter.Nrows)
-      occaGatherScatterKernel(ogs.fusedScatter, ogs.fusedGather,
-                              Nentries, Nvectors, stride, type, op, o_v);
-  } else {//ogs_sym
-    if(ogs.symGatherScatter.Nrows)
-      occaGatherScatterKernel(ogs.symGatherScatter, ogs.symGatherScatter,
-                              Nentries, Nvectors, stride, type, op, o_v);
-  }
-
-  occa::stream currentStream = device.getStream();
-  if (ogs.Nhalo) {
-    device.setStream(dataStream);
-    device.finish();
-    device.setStream(currentStream);
-  }
-
-  // MPI based gather scatter using libgs
-  gsGatherScatter(ogs.haloBuf, Nentries, Nvectors, ogs.Nhalo,
-                  type, op, trans, gsh);
-
-  dlong NhaloScatter = (trans == ogs_trans) ? ogs.haloGather.Nrows : ogs.haloScatter.Nrows;
-
-  if (NhaloScatter) {
-    device.setStream(dataStream);
-
-    // copy gatherScattered halo data back from HOST to DEVICE
-    for (int i=0;i<Nvectors;i++)
-      ogs.o_haloBuf.copyFrom((char*)ogs.haloBuf + ogs.Nhalo*Nbytes*Nentries*i,
-                             NhaloScatter*Nbytes*Nentries,
-                             ogs.Nhalo*Nbytes*Nentries*i,
-                             "async: true");
-
-    device.finish();
-    device.setStream(currentStream);
-
-    // scatter back to local nodes
-    if (trans == ogs_trans)
-      occaScatterKernel(ogs.haloGather, Nentries, Nvectors, ogs.Nhalo, stride,
-                        type, op, ogs.o_haloBuf, o_v);
-    else
-      occaScatterKernel(ogs.haloScatter, Nentries, Nvectors, ogs.Nhalo, stride,
-                        type, op, ogs.o_haloBuf, o_v);
-  }
-}
-
-
-#define SWITCH_TYPE_CASE(T) case ogs_##T: { WITH_TYPE(T); break; }
-#define SWITCH_TYPE(type) switch(type) { \
-    OGS_FOR_EACH_TYPE(SWITCH_TYPE_CASE) case ogs_type_n: break; }
-
-#define SWITCH_OP_CASE(T,OP) case ogs_##OP: { WITH_OP(T,OP); break; }
-#define SWITCH_OP(T,op) switch(op) { \
-    OGS_FOR_EACH_OP(T,SWITCH_OP_CASE) case ogs_op_n: break; }
-
-
-void occaGatherScatterKernel(const ogsData_t &gather,
-                             const ogsData_t &scatter,
-                             const int Nentries,
-                             const int Nvectors,
-                             const dlong stride,
-                             const ogs_type type,
-                             const ogs_op op,
-                             occa::memory&  o_v) {
-
-#define WITH_OP(T,OP)                                     \
-  gatherScatterKernel_##T##_##OP(gather.NrowBlocks,       \
-                                 Nentries,                \
-                                 Nvectors,                \
-                                 stride,                  \
-                                 gather.o_blockRowStarts, \
-                                 gather.o_rowStarts,      \
-                                 gather.o_colIds,         \
-                                 scatter.o_rowStarts,     \
-                                 scatter.o_colIds,        \
-                                 o_v);
-#define WITH_TYPE(T) SWITCH_OP(T,op)
-
-  SWITCH_TYPE(type)
-
-#undef  WITH_TYPE
-#undef  WITH_OP
-}
-
-} //namespace ogs
diff --git a/libs/ogs/occaGatheredHaloExchange.cpp b/libs/ogs/occaGatheredHaloExchange.cpp
deleted file mode 100644
index d8a6e99ab..000000000
--- a/libs/ogs/occaGatheredHaloExchange.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "ogs.hpp"
-#include "ogs/ogsKernels.hpp"
-
-OGS_DEFINE_TYPE_SIZES()
-
-using namespace ogs;
-
-void ogs_t::GatheredHaloExchangeStart(occa::memory& o_v,
-                                  const int k,
-                                  const ogs_type type){
-
-  occa::device &device = platform.device;
-  const size_t Nbytes = ogs_type_size[type];
-
-  reallocOccaBuffer(Nbytes*k);
-
-  if (haloGather.Nrows) {
-    occa::stream currentStream = device.getStream();
-    device.finish(); //make sure data is ready to copy
-    device.setStream(dataStream);
-
-    o_v.copyTo((char*)haloBuf,
-                haloGather.Nrows*Nbytes*k,
-                localGather.Nrows*Nbytes*k,
-                "async: true");
-
-    device.setStream(currentStream);
-  }
-}
-
-
-void ogs_t::GatheredHaloExchangeFinish(occa::memory& o_v,
-                                     const int k,
-                                     const ogs_type type){
-
-  occa::device &device = platform.device;
-  const size_t Nbytes = ogs_type_size[type];
-
-  occa::stream currentStream = device.getStream();
-  if (Nhalo) {
-    device.setStream(dataStream);
-    device.finish();
-    device.setStream(currentStream);
-  }
-
-  // MPI based scatter using gslib
-  // (must use ogs_notrans so the negative ids don't contribute to op)
-  gsGatherScatter(haloBuf, k, 1, Nhalo,
-                  type, ogs_add, ogs_notrans, gsh);
-
-  if (haloScatter.Nrows) {
-    device.setStream(dataStream);
-
-    // copy totally scattered halo data back from HOST to DEVICE
-    o_v.copyFrom((char*)haloBuf+haloGather.Nrows*Nbytes*k,
-                 (Nhalo-haloGather.Nrows)*Nbytes*k,
-                 Ngather*Nbytes*k,
-                 "async: true");
-
-    device.finish();
-    device.setStream(currentStream);
-  }
-}
-
-/* Build global to local mapping */
-void ogs_t::GatheredHaloExchangeSetup(){
-  dlong *ids = (dlong*) malloc((Ngather+NgatherHalo)*sizeof(dlong));
-
-  for (dlong n=0;n<Ngather+NgatherHalo;n++)
-    ids[n] = n;
-
-  GlobalToLocal = (dlong*) malloc(N*sizeof(dlong));
-
-  for (dlong n=0;n<N;n++)
-    GlobalToLocal[n] = -1;
-
-  for (dlong i=0;i<localScatter.Nrows;i++) {
-    const dlong start = localScatter.rowStarts[i];
-    const dlong end   = localScatter.rowStarts[i+1];
-    for (dlong j=start;j<end;j++) {
-      const dlong colId = localScatter.colIds[j];
-      GlobalToLocal[colId] = ids[i];
-    }
-  }
-  for (dlong i=0;i<haloScatter.Nrows;i++) {
-    const dlong start = haloScatter.rowStarts[i];
-    const dlong end   = haloScatter.rowStarts[i+1];
-    for (dlong j=start;j<end;j++) {
-      const dlong colId = haloScatter.colIds[j];
-      GlobalToLocal[colId] = ids[i+localScatter.Nrows];
-    }
-  }
-
-  free(ids);
-
-  o_GlobalToLocal = platform.malloc(N*sizeof(dlong), GlobalToLocal);
-}
\ No newline at end of file
diff --git a/libs/ogs/occaScatter.cpp b/libs/ogs/occaScatter.cpp
deleted file mode 100644
index 3f4733a3f..000000000
--- a/libs/ogs/occaScatter.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "ogs.hpp"
-#include "ogs/ogsKernels.hpp"
-
-namespace ogs {
-
-OGS_DEFINE_TYPE_SIZES()
-
-void occaScatterStart(occa::memory& o_v,
-                      occa::memory& o_gv,
-                      const int Nentries,
-                      const int Nvectors,
-                      const dlong stride,
-                      const dlong gstride,
-                      const ogs_type type,
-                      const ogs_op op,
-                      const ogs_transpose trans,
-                      ogs_t &ogs){
-
-  occa::device &device = ogs.platform.device;
-  const size_t Nbytes = ogs_type_size[type];
-
-  if (trans == ogs_sym)
-    LIBP_ABORT(string("Calling ogs::Scatter in ogs_sym mode not supported."))
-
-  ogs.reallocOccaBuffer(Nbytes*Nentries*Nvectors);
-
-  if (ogs.haloGather.Nrows) {
-    occa::stream currentStream = device.getStream();
-    device.finish(); //make sure its safe to start the transfer
-    device.setStream(dataStream);
-
-    for (int i=0;i<Nvectors;i++)
-      o_gv.copyTo((char*)ogs.haloBuf+ogs.Nhalo*Nbytes*Nentries*i,
-                  ogs.haloGather.Nrows*Nbytes*Nentries,
-                  ogs.localGather.Nrows*Nbytes*Nentries + gstride*Nbytes*i,
-                  "async: true");
-
-    device.setStream(currentStream);
-  }
-}
-
-
-void occaScatterFinish(occa::memory& o_v,
-                       occa::memory& o_gv,
-                       const int Nentries,
-                       const int Nvectors,
-                       const dlong stride,
-                       const dlong gstride,
-                       const ogs_type type,
-                       const ogs_op op,
-                       const ogs_transpose trans,
-                       ogs_t &ogs){
-
-  occa::device &device = ogs.platform.device;
-  const size_t Nbytes = ogs_type_size[type];
-
-  if (trans == ogs_sym)
-    LIBP_ABORT(string("Calling ogs::Scatter in ogs_sym mode not supported."))
-
-  dlong NlocalScatter = (trans == ogs_notrans) ? ogs.localScatter.Nrows : ogs.localGather.Nrows;
-
-  if(NlocalScatter) {
-    if (trans == ogs_notrans)
-      occaScatterKernel(ogs.localScatter, Nentries, Nvectors, gstride, stride,
-                        type, op, o_gv, o_v);
-    else
-      occaScatterKernel(ogs.localGather, Nentries, Nvectors, gstride, stride,
-                        type, op, o_gv, o_v);
-  }
-
-  occa::stream currentStream = device.getStream();
-  if (ogs.Nhalo) {
-    device.setStream(dataStream);
-    device.finish();
-    device.setStream(currentStream);
-  }
-
-  // MPI based scatter using gslib
-  // (must use ogs_notrans so the negative ids don't contribute to op)
-  gsGatherScatter(ogs.haloBuf, Nentries, Nvectors, ogs.Nhalo,
-                  type, op, ogs_notrans, ogs.gsh);
-
-  dlong NhaloScatter = (trans == ogs_notrans) ? ogs.haloScatter.Nrows : ogs.haloGather.Nrows;
-
-  if (NhaloScatter) {
-    device.setStream(dataStream);
-
-    // copy totally scattered halo data back from HOST to DEVICE
-    for (int i=0;i<Nvectors;i++)
-      ogs.o_haloBuf.copyFrom((char*)ogs.haloBuf + ogs.Nhalo*Nbytes*Nentries*i,
-                             NhaloScatter*Nbytes*Nentries,
-                             ogs.Nhalo*Nbytes*Nentries*i,
-                             "async: true");
-
-    device.finish();
-    device.setStream(currentStream);
-
-    if (trans == ogs_notrans)
-      occaScatterKernel(ogs.haloScatter, Nentries, Nvectors, ogs.Nhalo, stride,
-                        type, op, ogs.o_haloBuf, o_v);
-    else
-      occaScatterKernel(ogs.haloGather, Nentries, Nvectors, ogs.Nhalo, stride,
-                        type, op, ogs.o_haloBuf, o_v);
-  }
-}
-
-
-#define SWITCH_TYPE_CASE(T) case ogs_##T: { WITH_TYPE(T); break; }
-#define SWITCH_TYPE(type) switch(type) { \
-    OGS_FOR_EACH_TYPE(SWITCH_TYPE_CASE) case ogs_type_n: break; }
-
-void occaScatterKernel(const ogsData_t &scatter,
-                       const int Nentries,
-                       const int Nvectors,
-                       const dlong gstride,
-                       const dlong stride,
-                       const ogs_type type,
-                       const ogs_op op,
-                       occa::memory& o_gv,
-                       occa::memory& o_v) {
-
-#define WITH_TYPE(T)                          \
-  scatterKernel_##T(scatter.NrowBlocks,       \
-                    Nentries,                 \
-                    Nvectors,                 \
-                    gstride,                  \
-                    stride,                   \
-                    scatter.o_blockRowStarts, \
-                    scatter.o_rowStarts,      \
-                    scatter.o_colIds,         \
-                    o_gv,                     \
-                    o_v);
-
-  SWITCH_TYPE(type)
-
-#undef  WITH_TYPE
-}
-
-} //namespace ogs
\ No newline at end of file
diff --git a/libs/ogs/ogs.cpp b/libs/ogs/ogs.cpp
index ba1b0c24a..50e71e912 100644
--- a/libs/ogs/ogs.cpp
+++ b/libs/ogs/ogs.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,183 +25,553 @@ SOFTWARE.
 */
 
 #include "ogs.hpp"
-#include "ogs/ogsKernels.hpp"
+#include "ogs/ogsUtils.hpp"
+#include "ogs/ogsOperator.hpp"
+#include "ogs/ogsExchange.hpp"
+
+namespace libp {
+
+namespace ogs {
+
+/********************************
+ * Device GatherScatter
+ ********************************/
+template<typename T>
+void ogs_t::GatherScatter(deviceMemory<T> o_v,
+                          const int k,
+                          const Op op,
+                          const Transpose trans){
+  GatherScatterStart (o_v, k, op, trans);
+  GatherScatterFinish(o_v, k, op, trans);
+}
+
+template<typename T>
+void ogs_t::GatherScatterStart(deviceMemory<T> o_v,
+                               const int k,
+                               const Op op,
+                               const Transpose trans){
+  exchange->AllocBuffer(k*sizeof(T));
+
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
 
-// Host buffer versions
-void ogs_t::GatherScatter    (void  *v,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::hostGatherScatter(v, 1, 1, 0, type, op, trans, *this); }
+  //collect halo buffer
+  gatherHalo->Gather(o_haloBuf, o_v, k, op, trans);
 
-void ogs_t::GatherScatterVec (void  *v, const int k,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::hostGatherScatter(v, k, 1, 0, type, op, trans, *this); }
+  if (exchange->gpu_aware) {
+    //prepare MPI exchange
+    exchange->Start(o_haloBuf, k, op, trans);
+  } else {
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
 
-void ogs_t::GatherScatterMany(void  *v, const int k, const dlong stride,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::hostGatherScatter(v, 1, k, stride, type, op, trans, *this); }
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
 
-void ogs_t::Gather    (void  *gv, void  *v,
-                       const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::hostGather(gv, v, 1, 1, 0, 0, type, op, trans, *this); }
+    //if not using gpu-aware mpi move the halo buffer to the host
+    const dlong Nhalo = (trans == NoTrans) ? NhaloP : NhaloT;
 
-void ogs_t::GatherVec (void  *gv, void  *v, const int k,
-                       const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::hostGather(gv, v, k, 1, 0, 0, type, op, trans, *this); }
+    //wait for o_haloBuf to be ready
+    device.finish();
+
+    //queue copy to host
+    device.setStream(dataStream);
+    haloBuf.copyFrom(o_haloBuf, Nhalo*k,
+                     0, properties_t("async", true));
+    device.setStream(currentStream);
+  }
+}
 
-void ogs_t::GatherMany(void  *gv, void  *v, const int k,
-                       const dlong gstride, const dlong stride,
-                       const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::hostGather(gv, v, 1, k, gstride, stride, type, op, trans, *this); }
+template<typename T>
+void ogs_t::GatherScatterFinish(deviceMemory<T> o_v,
+                                const int k,
+                                const Op op,
+                                const Transpose trans){
+
+  //queue local gs operation
+  gatherLocal->GatherScatter(o_v, k, op, trans);
+
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
+  if (exchange->gpu_aware) {
+    //finish MPI exchange
+    exchange->Finish(o_haloBuf, k, op, trans);
+  } else {
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
+
+    //synchronize data stream to ensure the buffer is on the host
+    device.setStream(dataStream);
+    device.finish();
+
+    /*MPI exchange of host buffer*/
+    exchange->Start (haloBuf, k, op, trans);
+    exchange->Finish(haloBuf, k, op, trans);
+
+    // copy recv back to device
+    const dlong Nhalo = (trans == Trans) ? NhaloP : NhaloT;
+    haloBuf.copyTo(o_haloBuf, Nhalo*k,
+                   0, properties_t("async", true));
+    device.finish(); //wait for transfer to finish
+    device.setStream(currentStream);
+  }
+
+  //write exchanged halo buffer back to vector
+  gatherHalo->Scatter(o_v, o_haloBuf, k, trans);
+}
 
-void ogs_t::Scatter    (void  *v, void  *gv,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::hostScatter(v, gv, 1, 1, 0, 0, type, op, trans, *this); }
+template
+void ogs_t::GatherScatter(deviceMemory<float> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(deviceMemory<double> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(deviceMemory<int> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(deviceMemory<long long int> v, const int k,
+                          const Op op, const Transpose trans);
+
+/********************************
+ * Host GatherScatter
+ ********************************/
+template<typename T>
+void ogs_t::GatherScatter(memory<T> v,
+                          const int k,
+                          const Op op,
+                          const Transpose trans){
+  GatherScatterStart (v, k, op, trans);
+  GatherScatterFinish(v, k, op, trans);
+}
 
-void ogs_t::ScatterVec (void  *v, void  *gv, const int k,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::hostScatter(v, gv, k, 1, 0, 0, type, op, trans, *this); }
+template<typename T>
+void ogs_t::GatherScatterStart(memory<T> v,
+                               const int k,
+                               const Op op,
+                               const Transpose trans){
+  exchange->AllocBuffer(k*sizeof(T));
 
-void ogs_t::ScatterMany(void  *v, void  *gv, const int k,
-                        const dlong stride, const dlong gstride,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::hostScatter(v, gv, 1, k, stride, gstride, type, op, trans, *this); }
+  /*Cast workspace to type T*/
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
 
+  //collect halo buffer
+  gatherHalo->Gather(haloBuf, v, k, op, trans);
 
-// Synchronous device buffer versions
-void ogs_t::GatherScatter    (occa::memory&  o_v,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans) {
-  ogs::occaGatherScatterStart (o_v, 1, 1, 0, type, op, trans, *this);
-  ogs::occaGatherScatterFinish(o_v, 1, 1, 0, type, op, trans, *this);
+  //prepare MPI exchange
+  exchange->Start(haloBuf, k, op, trans);
 }
 
-void ogs_t::GatherScatterVec (occa::memory&  o_v, const int k,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans) {
-  ogs::occaGatherScatterStart (o_v, k, 1, 0, type, op, trans, *this);
-  ogs::occaGatherScatterFinish(o_v, k, 1, 0, type, op, trans, *this);
+template<typename T>
+void ogs_t::GatherScatterFinish(memory<T> v,
+                                const int k,
+                                const Op op,
+                                const Transpose trans){
+
+  /*Cast workspace to type T*/
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+  //queue local gs operation
+  gatherLocal->GatherScatter(v, k, op, trans);
+
+  //finish MPI exchange
+  exchange->Finish(haloBuf, k, op, trans);
+
+  //write exchanged halo buffer back to vector
+  gatherHalo->Scatter(v, haloBuf, k, trans);
 }
 
-void ogs_t::GatherScatterMany(occa::memory&  o_v, const int k,
-                              const dlong stride,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans) {
-  ogs::occaGatherScatterStart (o_v, 1, k, stride, type, op, trans, *this);
-  ogs::occaGatherScatterFinish(o_v, 1, k, stride, type, op, trans, *this);
+template //explicit instantiations of the host (memory<T>) GatherScatter for float, double, int, long long int
+void ogs_t::GatherScatter(memory<float> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(memory<double> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(memory<int> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(memory<long long int> v, const int k,
+                          const Op op, const Transpose trans);
+
+/********************************
+ * Device Gather
+ ********************************/
+template<typename T>
+void ogs_t::Gather(deviceMemory<T> o_gv,
+                   deviceMemory<T> o_v,
+                   const int k,
+                   const Op op,
+                   const Transpose trans){
+  GatherStart (o_gv, o_v, k, op, trans); //blocking device gather = start phase (halo gather, begin exchange) ...
+  GatherFinish(o_gv, o_v, k, op, trans); //... followed immediately by the finish phase (local gather, complete exchange)
 }
 
-void ogs_t::Gather    (occa::memory&  o_gv, occa::memory&  o_v,
-                       const ogs_type type, const ogs_op op, const ogs_transpose trans) {
-  ogs::occaGatherStart (o_gv, o_v, 1, 1, 0, 0, type, op, trans, *this);
-  ogs::occaGatherFinish(o_gv, o_v, 1, 1, 0, 0, type, op, trans, *this);
+template<typename T>
+void ogs_t::GatherStart(deviceMemory<T> o_gv,
+                        deviceMemory<T> o_v,
+                        const int k,
+                        const Op op,
+                        const Transpose trans){
+  AssertGatherDefined();
+  //Start phase of the device gather: gather the halo nodes of o_v and, when trans==Trans, begin the exchange
+  if (trans==Trans) { //if trans!=ogs::Trans theres no comms required
+    exchange->AllocBuffer(k*sizeof(T));
+    //take the typed view only after AllocBuffer, which may replace o_workspace with a new allocation
+    deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
+    //collect halo buffer
+    gatherHalo->Gather(o_haloBuf, o_v, k, op, Trans);
+
+    if (exchange->gpu_aware) {
+      //prepare MPI exchange directly on the device buffer
+      exchange->Start(o_haloBuf, k, op, Trans);
+    } else {
+      //get current stream
+      device_t &device = platform.device;
+      stream_t currentStream = device.getStream();
+
+      //if not using gpu-aware mpi move the halo buffer to the host
+      pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+      //wait for o_haloBuf to be ready
+      device.finish();
+
+      //queue async copy of NhaloT*k entries to host on dataStream, then restore the caller's stream
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_haloBuf, NhaloT*k,
+                       0, properties_t("async", true));
+      device.setStream(currentStream);
+    }
+  } else {
+    //no exchange: gather halo straight into o_gv at offset k*NlocalT
+    gatherHalo->Gather(o_gv + k*NlocalT, o_v, k, op, trans);
+  }
 }
 
-void ogs_t::GatherVec (occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                       const ogs_type type, const ogs_op op, const ogs_transpose trans) {
-  ogs::occaGatherStart (o_gv, o_v, k, 1, 0, 0, type, op, trans, *this);
-  ogs::occaGatherFinish(o_gv, o_v, k, 1, 0, 0, type, op, trans, *this);
+template<typename T>
+void ogs_t::GatherFinish(deviceMemory<T> o_gv,
+                         deviceMemory<T> o_v,
+                         const int k,
+                         const Op op,
+                         const Transpose trans){
+  AssertGatherDefined();
+  //Finish phase of the device gather: local gather into o_gv, then (trans==Trans only) complete the halo exchange
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
+  //queue local gather operation
+  gatherLocal->Gather(o_gv, o_v, k, op, trans);
+
+  if (trans==Trans) { //if trans!=ogs::Trans theres no comms required
+    if (exchange->gpu_aware) {
+      //finish MPI exchange on the device buffer
+      exchange->Finish(o_haloBuf, k, op, Trans);
+
+      //copy k*NhaloP halo entries into o_gv starting at offset k*NlocalT
+      o_haloBuf.copyTo(o_gv + k*NlocalT,
+                       k*NhaloP, 0, properties_t("async", true));
+    } else {
+      pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+      //get current stream
+      device_t &device = platform.device;
+      stream_t currentStream = device.getStream();
+
+      //synchronize data stream to ensure the buffer is on the host
+      device.setStream(dataStream);
+      device.finish();
+
+      /*MPI exchange of host buffer (started and completed here)*/
+      exchange->Start (haloBuf, k, op, trans);
+      exchange->Finish(haloBuf, k, op, trans);
+
+      // copy recv back to device:
+      //k*NhaloP entries go into o_gv starting at offset k*NlocalT
+      haloBuf.copyTo(o_gv + k*NlocalT, k*NhaloP,
+                     0, properties_t("async", true));
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream); //restore the caller's stream
+    }
+  }
 }
 
-void ogs_t::GatherMany(occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                       const dlong gstride, const dlong stride,
-                       const ogs_type type, const ogs_op op, const ogs_transpose trans) {
-  ogs::occaGatherStart (o_gv, o_v, 1, k, gstride, stride, type, op, trans, *this);
-  ogs::occaGatherFinish(o_gv, o_v, 1, k, gstride, stride, type, op, trans, *this);
+template //explicit instantiations of the device Gather; parameter names are labels only (the definition orders them o_gv, o_v)
+void ogs_t::Gather(deviceMemory<float> v, const deviceMemory<float> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(deviceMemory<double> v, const deviceMemory<double> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(deviceMemory<int> v, const deviceMemory<int> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(deviceMemory<long long int> v, const deviceMemory<long long int> gv,
+                   const int k, const Op op, const Transpose trans);
+
+/********************************
+ * Host Gather
+ ********************************/
+
+//host versions
+template<typename T>
+void ogs_t::Gather(memory<T> gv,
+                   const memory<T> v,
+                   const int k,
+                   const Op op,
+                   const Transpose trans){
+  GatherStart (gv, v, k, op, trans); //blocking host gather = start phase (halo gather, begin exchange) ...
+  GatherFinish(gv, v, k, op, trans); //... followed immediately by the finish phase (local gather, complete exchange)
 }
 
-void ogs_t::Scatter    (occa::memory&  o_v, occa::memory&  o_gv,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans) {
-  ogs::occaScatterStart (o_v, o_gv, 1, 1, 0, 0, type, op, trans, *this);
-  ogs::occaScatterFinish(o_v, o_gv, 1, 1, 0, 0, type, op, trans, *this);
+template<typename T>
+void ogs_t::GatherStart(memory<T> gv,
+                        const memory<T> v,
+                        const int k,
+                        const Op op,
+                        const Transpose trans){
+  AssertGatherDefined();
+  //Start phase of the host gather: gather the halo nodes of v and, when trans==Trans, begin the exchange
+  if (trans==Trans) { //if trans!=ogs::Trans theres no comms required
+    exchange->AllocBuffer(k*sizeof(T));
+
+    /*Cast workspace to type T (after AllocBuffer, so the view refers to the current allocation)*/
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    //collect halo buffer
+    gatherHalo->Gather(haloBuf, v, k, op, Trans);
+
+    //prepare MPI exchange
+    exchange->Start(haloBuf, k, op, Trans);
+  } else {
+    //no exchange: gather halo straight into gv at offset k*NlocalT
+    gatherHalo->Gather(gv + k*NlocalT, v, k, op, trans);
+  }
 }
 
-void ogs_t::ScatterVec (occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans) {
-  ogs::occaScatterStart (o_v, o_gv, k, 1, 0, 0, type, op, trans, *this);
-  ogs::occaScatterFinish(o_v, o_gv, k, 1, 0, 0, type, op, trans, *this);
+template<typename T>
+void ogs_t::GatherFinish(memory<T> gv,
+                         const memory<T> v,
+                         const int k,
+                         const Op op,
+                         const Transpose trans){
+  AssertGatherDefined();
+  //Finish phase of the host gather: local gather into gv, then (trans==Trans only) complete the halo exchange
+  //queue local gather operation
+  gatherLocal->Gather(gv, v, k, op, trans);
+
+  if (trans==Trans) { //if trans!=ogs::Trans theres no comms required
+    /*Cast workspace to type T*/
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    //finish MPI exchange
+    exchange->Finish(haloBuf, k, op, Trans);
+
+    //copy k*NhaloP halo entries into gv starting at offset k*NlocalT
+    haloBuf.copyTo(gv+k*NlocalT, k*NhaloP);
+  }
 }
 
-void ogs_t::ScatterMany(occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                        const dlong stride, const dlong gstride,
-                        const ogs_type type, const ogs_op op, const ogs_transpose trans) {
-  ogs::occaScatterStart (o_v, o_gv, 1, k, stride, gstride, type, op, trans, *this);
-  ogs::occaScatterFinish(o_v, o_gv, 1, k, stride, gstride, type, op, trans, *this);
+template //explicit instantiations of the host Gather; parameter names are labels only (the definition orders them gv, v)
+void ogs_t::Gather(memory<float> v, const memory<float> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(memory<double> v, const memory<double> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(memory<int> v, const memory<int> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(memory<long long int> v, const memory<long long int> gv,
+                   const int k, const Op op, const Transpose trans);
+
+/********************************
+ * Device Scatter
+ ********************************/
+template<typename T>
+void ogs_t::Scatter(deviceMemory<T> o_v,
+                    deviceMemory<T> o_gv,
+                    const int k,
+                    const Transpose trans){
+  ScatterStart (o_v, o_gv, k, trans); //blocking device scatter = start phase (ship halo part of o_gv) ...
+  ScatterFinish(o_v, o_gv, k, trans); //... followed immediately by the finish phase (local + halo scatter into o_v)
 }
 
-// Asynchronous device buffer versions
-void ogs_t::GatherScatterStart     (occa::memory&  o_v,
-                                    const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherScatterStart (o_v, 1, 1, 0, type, op, trans, *this); }
+template<typename T>
+void ogs_t::ScatterStart(deviceMemory<T> o_v,
+                         deviceMemory<T> o_gv,
+                         const int k,
+                         const Transpose trans){ //start phase of the device scatter; only trans==NoTrans needs comms
+  AssertGatherDefined();
 
-void ogs_t::GatherScatterFinish    (occa::memory&  o_v,
-                                    const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherScatterFinish(o_v, 1, 1, 0, type, op, trans, *this); }
+  if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required
 
-void ogs_t::GatherScatterVecStart  (occa::memory&  o_v, const int k,
-                                    const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherScatterStart (o_v, k, 1, 0, type, op, trans, *this); }
+    exchange->AllocBuffer(k*sizeof(T));
+    deviceMemory<T> o_haloBuf = exchange->o_workspace; //view taken only after AllocBuffer, which may replace o_workspace
 
-void ogs_t::GatherScatterVecFinish (occa::memory&  o_v, const int k,
-                                    const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherScatterFinish(o_v, k, 1, 0, type, op, trans, *this); }
+    device_t &device = platform.device;
 
-void ogs_t::GatherScatterManyStart (occa::memory&  o_v, const int k, const dlong stride,
-                                    const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherScatterStart (o_v, 1, k, stride, type, op, trans, *this); }
+    if (exchange->gpu_aware) {
+      //collect halo buffer: k*NhaloP entries of o_gv starting at offset k*NlocalT
+      o_haloBuf.copyFrom(o_gv + k*NlocalT,
+                         k*NhaloP, 0, properties_t("async", true));
 
-void ogs_t::GatherScatterManyFinish(occa::memory&  o_v, const int k, const dlong stride,
-                                    const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherScatterFinish(o_v, 1, k, stride, type, op, trans, *this); }
+      //wait for o_haloBuf to be ready
+      device.finish();
 
-void ogs_t::GatherStart     (occa::memory&  o_gv, occa::memory&  o_v,
-                             const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherStart (o_gv, o_v, 1, 1, 0, 0, type, op, trans, *this); }
+      //prepare MPI exchange directly on the device buffer
+      exchange->Start(o_haloBuf, k, Add, NoTrans);
+    } else {
+      //get current stream
+      stream_t currentStream = device.getStream();
 
-void ogs_t::GatherFinish    (occa::memory&  o_gv, occa::memory&  o_v,
-                             const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherFinish(o_gv, o_v, 1, 1, 0, 0, type, op, trans, *this); }
+      //if not using gpu-aware mpi move the halo buffer to the host
+      pinnedMemory<T> haloBuf = exchange->h_workspace;
 
-void ogs_t::GatherVecStart  (occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                             const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherStart (o_gv, o_v, k, 1, 0, 0, type, op, trans, *this); }
+      //wait for o_gv to be ready
+      device.finish();
 
-void ogs_t::GatherVecFinish (occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                             const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherFinish(o_gv, o_v, k, 1, 0, 0, type, op, trans, *this); }
+      //queue async copy to host on dataStream, then restore the caller's stream
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_gv + k*NlocalT, NhaloP*k,
+                       0, properties_t("async", true));
+      device.setStream(currentStream);
+    }
+  }
+}
 
-void ogs_t::GatherManyStart (occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                             const dlong gstride, const dlong stride,
-                             const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherStart (o_gv, o_v, 1, k, gstride, stride, type, op, trans, *this); }
+template<typename T>
+void ogs_t::ScatterFinish(deviceMemory<T> o_v,
+                          deviceMemory<T> o_gv,
+                          const int k,
+                          const Transpose trans){
+  AssertGatherDefined();
+  //Finish phase of the device scatter: local scatter into o_v, then scatter the (exchanged) halo values
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
+  //queue local scatter operation
+  gatherLocal->Scatter(o_v, o_gv, k, trans);
+
+  if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required
+    if (exchange->gpu_aware) {
+      //finish MPI exchange on the device buffer
+      exchange->Finish(o_haloBuf, k, Add, NoTrans);
+    } else {
+      pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+      //get current stream
+      device_t &device = platform.device;
+      stream_t currentStream = device.getStream();
+
+      //synchronize data stream to ensure the buffer is on the host
+      device.setStream(dataStream);
+      device.finish();
+
+      /*MPI exchange of host buffer (started and completed here)*/
+      exchange->Start (haloBuf, k, Add, NoTrans);
+      exchange->Finish(haloBuf, k, Add, NoTrans);
+
+      // copy recv back to device (NhaloT*k entries into the device workspace)
+      haloBuf.copyTo(o_haloBuf, NhaloT*k,
+                     0, properties_t("async", true));
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream); //restore the caller's stream
+    }
+
+    //scatter halo buffer into o_v
+    gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans);
+  } else {
+    //no exchange: scatter halo straight from o_gv at offset k*NlocalT
+    gatherHalo->Scatter(o_v, o_gv + k*NlocalT, k, trans);
+  }
+}
 
-void ogs_t::GatherManyFinish(occa::memory&  o_gv, occa::memory&  o_v, const int k,
-                             const dlong gstride, const dlong stride,
-                             const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaGatherFinish(o_gv, o_v, 1, k, gstride, stride, type, op, trans, *this); }
+template //explicit instantiations of the device Scatter for float, double, int, long long int
+void ogs_t::Scatter(deviceMemory<float> v, const deviceMemory<float> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(deviceMemory<double> v, const deviceMemory<double> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(deviceMemory<int> v, const deviceMemory<int> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(deviceMemory<long long int> v, const deviceMemory<long long int> gv,
+                    const int k, const Transpose trans);
+
+/********************************
+ * Host Scatter
+ ********************************/
+
+//host versions
+template<typename T>
+void ogs_t::Scatter(memory<T> v,
+                    const memory<T> gv,
+                    const int k,
+                    const Transpose trans){
+  ScatterStart (v, gv, k, trans); //blocking host scatter = start phase (ship halo part of gv) ...
+  ScatterFinish(v, gv, k, trans); //... followed immediately by the finish phase (local + halo scatter into v)
+}
 
-void ogs_t::ScatterStart     (occa::memory&  o_v, occa::memory&  o_gv,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaScatterStart (o_v, o_gv, 1, 1, 0, 0, type, op, trans, *this); }
+template<typename T>
+void ogs_t::ScatterStart(memory<T> v,
+                         const memory<T> gv,
+                         const int k,
+                         const Transpose trans){ //start phase of the host scatter; only trans==NoTrans needs comms
+  AssertGatherDefined();
 
-void ogs_t::ScatterFinish    (occa::memory&  o_v, occa::memory&  o_gv,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaScatterFinish(o_v, o_gv, 1, 1, 0, 0, type, op, trans, *this); }
+  if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required
+    exchange->AllocBuffer(k*sizeof(T));
 
-void ogs_t::ScatterVecStart  (occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaScatterStart (o_v, o_gv, k, 1, 0, 0, type, op, trans, *this); }
+    /*Cast workspace to type T (after AllocBuffer, so the view refers to the current allocation)*/
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
 
-void ogs_t::ScatterVecFinish (occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaScatterFinish(o_v, o_gv, k, 1, 0, 0, type, op, trans, *this); }
+    //collect halo buffer: k*NhaloP entries of gv starting at offset k*NlocalT
+    haloBuf.copyFrom(gv + k*NlocalT, k*NhaloP);
 
-void ogs_t::ScatterManyStart (occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                              const dlong stride, const dlong gstride,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaScatterStart (o_v, o_gv, 1, k, stride, gstride, type, op, trans, *this); }
+    //prepare MPI exchange (always started with op Add and NoTrans here)
+    exchange->Start(haloBuf, k, Add, NoTrans);
+  }
+}
 
-void ogs_t::ScatterManyFinish(occa::memory&  o_v, occa::memory&  o_gv, const int k,
-                              const dlong stride, const dlong gstride,
-                              const ogs_type type, const ogs_op op, const ogs_transpose trans)
-{ ogs::occaScatterFinish(o_v, o_gv, 1, k, stride, gstride, type, op, trans, *this); }
+template<typename T>
+void ogs_t::ScatterFinish(memory<T> v,
+                          const memory<T> gv,
+                          const int k,
+                          const Transpose trans){
+  AssertGatherDefined();
+  //Finish phase of the host scatter: local scatter into v, then scatter the (exchanged) halo values
+  //queue local scatter operation
+  gatherLocal->Scatter(v, gv, k, trans);
+
+  if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required
+    /*Cast workspace to type T*/
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    //finish MPI exchange (the result stays in haloBuf)
+    exchange->Finish(haloBuf, k, Add, NoTrans);
+
+    //scatter halo buffer into v
+    gatherHalo->Scatter(v, haloBuf, k, NoTrans);
+  } else {
+    //no exchange: scatter halo straight from gv at offset k*NlocalT
+    gatherHalo->Scatter(v, gv + k*NlocalT, k, trans);
+  }
+}
 
-void ogs_t::Unique(hlong *ids, dlong _N, MPI_Comm _comm) {
-  ogs::gsUnique(ids, _N, _comm);
-}
\ No newline at end of file
+template //explicit instantiations of the host Scatter for float, double, int, long long int
+void ogs_t::Scatter(memory<float> v, const memory<float> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(memory<double> v, const memory<double> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(memory<int> v, const memory<int> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(memory<long long int> v, const memory<long long int> gv,
+                    const int k, const Transpose trans);
+} //namespace ogs
+
+} //namespace libp
diff --git a/libs/ogs/ogsAllToAll.cpp b/libs/ogs/ogsAllToAll.cpp
new file mode 100644
index 000000000..051ffa174
--- /dev/null
+++ b/libs/ogs/ogsAllToAll.cpp
@@ -0,0 +1,358 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "ogs.hpp"
+#include "ogs/ogsUtils.hpp"
+#include "ogs/ogsExchange.hpp"
+
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::sort;
+#else
+using std::sort;
+#endif
+
+namespace libp {
+
+namespace ogs {
+
+/**********************************
+* Host exchange
+***********************************/
+template<typename T>
+inline void ogsAllToAll_t::Start(pinnedMemory<T> &buf, const int k,
+                          const Op op, const Transpose trans){
+  //Host start: pack the send buffer from buf and post a non-blocking all-to-all (op is not used here)
+  pinnedMemory<T> sendBuf = h_sendspace;
+
+  // extract the send buffer using the id list that matches trans
+  if (trans == NoTrans)
+    extract(NsendN, k, sendIdsN, buf, sendBuf);
+  else
+    extract(NsendT, k, sendIdsT, buf, sendBuf);
+
+  if (trans==NoTrans) {
+    for (int r=0;r<size;++r) {
+      sendCounts[r] = k*mpiSendCountsN[r];
+      recvCounts[r] = k*mpiRecvCountsN[r];
+      sendOffsets[r+1] = k*mpiSendOffsetsN[r+1];
+      recvOffsets[r+1] = k*mpiRecvOffsetsN[r+1];
+    }
+  } else {
+    for (int r=0;r<size;++r) {
+      sendCounts[r] = k*mpiSendCountsT[r];
+      recvCounts[r] = k*mpiRecvCountsT[r];
+      sendOffsets[r+1] = k*mpiSendOffsetsT[r+1];
+      recvOffsets[r+1] = k*mpiRecvOffsetsT[r+1];
+    }
+  }
+
+  // collect everything needed with single MPI all to all; received data lands in buf at offset Nhalo*k
+  comm.Ialltoallv(sendBuf,     sendCounts, sendOffsets,
+                  buf+Nhalo*k, recvCounts, recvOffsets,
+                  request);
+}
+
+template<typename T>
+inline void ogsAllToAll_t::Finish(pinnedMemory<T> &buf, const int k,
+                           const Op op, const Transpose trans){
+  //Host finish: wait on the request posted by Start, then fold the received entries into buf
+  comm.Wait(request);
+
+  //if we received anything via MPI, gather the recv buffer
+  // in place (buf is both the input and the output of postmpi.Gather)
+  dlong Nrecv = recvOffsets[size];
+  if (Nrecv) {
+    // gather the received nodes
+    postmpi.Gather(buf, buf, k, op, trans);
+  }
+}
+
+void ogsAllToAll_t::Start(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); } //non-template host overloads forward to the templated Start/Finish
+void ogsAllToAll_t::Start(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsAllToAll_t::Start(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsAllToAll_t::Start(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
+/**********************************
+* GPU-aware exchange
+***********************************/
+template<typename T>
+void ogsAllToAll_t::Start(deviceMemory<T> &o_buf,
+                          const int k,
+                          const Op op,
+                          const Transpose trans){
+  //GPU-aware start: only packs the device send buffer; no MPI call is made here (op is not used)
+  const dlong Nsend = (trans == NoTrans) ? NsendN : NsendT;
+
+  if (Nsend) {
+    deviceMemory<T> o_sendBuf = o_sendspace;
+
+    // assemble the send buffer on device with the extract kernel selected for type T
+    if (trans == NoTrans) {
+      extractKernel[ogsType<T>::get()](NsendN, k, o_sendIdsN, o_buf, o_sendBuf);
+    } else {
+      extractKernel[ogsType<T>::get()](NsendT, k, o_sendIdsT, o_buf, o_sendBuf);
+    }
+    //wait for the extract kernel to finish before returning
+    device_t &device = platform.device;
+    device.finish();
+  }
+}
+
+template<typename T>
+void ogsAllToAll_t::Finish(deviceMemory<T> &o_buf,
+                           const int k,
+                           const Op op,
+                           const Transpose trans){
+  //GPU-aware finish: scale counts/offsets by k, run a blocking all-to-all on device buffers, then gather
+  deviceMemory<T> o_sendBuf = o_sendspace;
+
+  if (trans==NoTrans) {
+    for (int r=0;r<size;++r) {
+      sendCounts[r] = k*mpiSendCountsN[r];
+      recvCounts[r] = k*mpiRecvCountsN[r];
+      sendOffsets[r+1] = k*mpiSendOffsetsN[r+1];
+      recvOffsets[r+1] = k*mpiRecvOffsetsN[r+1];
+    }
+  } else {
+    for (int r=0;r<size;++r) {
+      sendCounts[r] = k*mpiSendCountsT[r];
+      recvCounts[r] = k*mpiRecvCountsT[r];
+      sendOffsets[r+1] = k*mpiSendOffsetsT[r+1];
+      recvOffsets[r+1] = k*mpiRecvOffsetsT[r+1];
+    }
+  }
+
+  // collect everything needed with single MPI all to all; received data lands in o_buf at offset Nhalo*k
+  comm.Alltoallv(o_sendBuf,     sendCounts, sendOffsets,
+                 o_buf+Nhalo*k, recvCounts, recvOffsets);
+
+  //if we received anything via MPI, gather the recv buffer
+  // in place (o_buf is both the input and the output of postmpi.Gather)
+  dlong Nrecv = recvOffsets[size];
+  if (Nrecv) {
+    // gather the received nodes on device
+    postmpi.Gather(o_buf, o_buf, k, op, trans);
+  }
+}
+
+void ogsAllToAll_t::Start(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); } //non-template device overloads forward to the templated Start/Finish
+void ogsAllToAll_t::Start(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsAllToAll_t::Start(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsAllToAll_t::Start(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
+ogsAllToAll_t::ogsAllToAll_t(dlong Nshared,
+                             memory<parallelNode_t> &sharedNodes,
+                             ogsOperator_t& gatherHalo,
+                             stream_t _dataStream,
+                             comm_t _comm,
+                             platform_t &_platform):
+  ogsExchange_t(_platform,_comm, _dataStream) {
+  //Builds the all-to-all exchange: per-rank counts/offsets, send id lists, and the postmpi gather operator
+  Nhalo  = gatherHalo.NrowsT;
+  NhaloP = gatherHalo.NrowsN;
+
+  // sort the list by rank, i.e. the order in which the nodes are sent by the all-to-all exchange
+  sort(sharedNodes.ptr(), sharedNodes.ptr()+Nshared,
+       [](const parallelNode_t& a, const parallelNode_t& b) {
+         if(a.rank < b.rank) return true; //group by rank
+         if(a.rank > b.rank) return false;
+
+         return a.newId < b.newId; //then order by newId within a rank
+       });
+
+  //make mpi alltoallv counts and offsets (send counts start at zero via calloc)
+  mpiSendCountsT.calloc(size);
+  mpiSendCountsN.calloc(size);
+  mpiRecvCountsT.malloc(size);
+  mpiRecvCountsN.malloc(size);
+  mpiSendOffsetsT.malloc(size+1);
+  mpiSendOffsetsN.malloc(size+1);
+  mpiRecvOffsetsN.malloc(size+1);
+  mpiRecvOffsetsT.malloc(size+1);
+
+  for (dlong n=0;n<Nshared;n++) { //loop through nodes we need to send
+    const int r = sharedNodes[n].rank;
+    if (sharedNodes[n].sign>0) mpiSendCountsN[r]++; //NOTE(review): counted for sign>0, but sendIdsN below is filled for sign==2 - confirm these agree
+    mpiSendCountsT[r]++;
+  }
+
+  //share counts: each rank's receive counts are the other ranks' send counts
+  comm.Alltoall(mpiSendCountsT, mpiRecvCountsT);
+  comm.Alltoall(mpiSendCountsN, mpiRecvCountsN);
+
+  //cumulative sum of counts gives the per-rank offsets
+  mpiSendOffsetsN[0] = 0;
+  mpiSendOffsetsT[0] = 0;
+  mpiRecvOffsetsN[0] = 0;
+  mpiRecvOffsetsT[0] = 0;
+  for (int r=0;r<size;r++) {
+    mpiSendOffsetsN[r+1] = mpiSendOffsetsN[r]+mpiSendCountsN[r];
+    mpiSendOffsetsT[r+1] = mpiSendOffsetsT[r]+mpiSendCountsT[r];
+    mpiRecvOffsetsN[r+1] = mpiRecvOffsetsN[r]+mpiRecvCountsN[r];
+    mpiRecvOffsetsT[r+1] = mpiRecvOffsetsT[r]+mpiRecvCountsT[r];
+  }
+
+  //make id lists for extracting halo nodes into the send buffer before sending
+  NsendN=mpiSendOffsetsN[size];
+  NsendT=mpiSendOffsetsT[size];
+
+  sendIdsN.malloc(NsendN);
+  sendIdsT.malloc(NsendT);
+
+  NsendN=0; //positive node count (reset, then recounted while filling sendIdsN)
+  NsendT=0; //all node count (reset, then recounted while filling sendIdsT)
+
+  for (dlong n=0;n<Nshared;n++) { //loop through nodes we need to send
+    dlong id = sharedNodes[n].newId; //coalesced index for this baseId on this rank
+    if (sharedNodes[n].sign==2) {
+      sendIdsN[NsendN++] = id;
+    }
+    sendIdsT[NsendT++] = id;
+  }
+  o_sendIdsT = platform.malloc(sendIdsT);
+  o_sendIdsN = platform.malloc(sendIdsN);
+
+  //send the node lists so we know what we'll receive
+  dlong Nrecv = mpiRecvOffsetsT[size];
+  memory<parallelNode_t> recvNodes(Nrecv);
+
+  //Send list of nodes to each rank
+  comm.Alltoallv(sharedNodes, mpiSendCountsT, mpiSendOffsetsT,
+                   recvNodes, mpiRecvCountsT, mpiRecvOffsetsT);
+
+  //make ops for gathering halo nodes after the MPI all-to-all
+  postmpi.platform = platform;
+  postmpi.kind = Signed;
+
+  postmpi.NrowsN = Nhalo;
+  postmpi.NrowsT = Nhalo;
+  postmpi.rowStartsN.malloc(Nhalo+1);
+  postmpi.rowStartsT.malloc(Nhalo+1);
+
+  //make array of counters
+  memory<dlong> haloGatherTCounts(Nhalo);
+  memory<dlong> haloGatherNCounts(Nhalo);
+
+  //count the data that will already be in h_haloBuf.ptr()
+  for (dlong n=0;n<Nhalo;n++) {
+    haloGatherNCounts[n] = (n<NhaloP) ? 1 : 0;
+    haloGatherTCounts[n] = 1;
+  }
+
+  for (dlong n=0;n<Nrecv;n++) { //loop through nodes needed for gathering halo nodes
+    dlong id = recvNodes[n].localId; //coalesced index for this baseId on this rank
+    if (recvNodes[n].sign==2) haloGatherNCounts[id]++;  //tally
+    haloGatherTCounts[id]++;  //tally
+  }
+
+  postmpi.rowStartsN[0] = 0;
+  postmpi.rowStartsT[0] = 0;
+  for (dlong i=0;i<Nhalo;i++) {
+    postmpi.rowStartsN[i+1] = postmpi.rowStartsN[i] + haloGatherNCounts[i];
+    postmpi.rowStartsT[i+1] = postmpi.rowStartsT[i] + haloGatherTCounts[i];
+    haloGatherNCounts[i] = 0; //reset: reused below as per-row fill cursors
+    haloGatherTCounts[i] = 0;
+  }
+  postmpi.nnzN = postmpi.rowStartsN[Nhalo];
+  postmpi.nnzT = postmpi.rowStartsT[Nhalo];
+  postmpi.colIdsN.malloc(postmpi.nnzN);
+  postmpi.colIdsT.malloc(postmpi.nnzT);
+
+  for (dlong n=0;n<NhaloP;n++) {
+    const dlong soffset = postmpi.rowStartsN[n];
+    const int sindex  = haloGatherNCounts[n];
+    postmpi.colIdsN[soffset+sindex] = n; //record id
+    haloGatherNCounts[n]++;
+  }
+  for (dlong n=0;n<Nhalo;n++) {
+    const dlong soffset = postmpi.rowStartsT[n];
+    const int sindex  = haloGatherTCounts[n];
+    postmpi.colIdsT[soffset+sindex] = n; //record id
+    haloGatherTCounts[n]++;
+  }
+
+  dlong cnt=Nhalo; //next column id for received sign==2 nodes; ids start at Nhalo
+  for (dlong n=0;n<Nrecv;n++) { //loop through the nodes we received
+    dlong id = recvNodes[n].localId; //coalesced index for this baseId on this rank
+    if (recvNodes[n].sign==2) {
+      const dlong soffset = postmpi.rowStartsN[id];
+      const int sindex  = haloGatherNCounts[id];
+      postmpi.colIdsN[soffset+sindex] = cnt++; //record id
+      haloGatherNCounts[id]++;
+    }
+    const dlong soffset = postmpi.rowStartsT[id];
+    const int sindex  = haloGatherTCounts[id];
+    postmpi.colIdsT[soffset+sindex] = n + Nhalo; //record id
+    haloGatherTCounts[id]++;
+  }
+
+  postmpi.o_rowStartsN = platform.malloc(postmpi.rowStartsN);
+  postmpi.o_rowStartsT = platform.malloc(postmpi.rowStartsT);
+  postmpi.o_colIdsN = platform.malloc(postmpi.colIdsN);
+  postmpi.o_colIdsT = platform.malloc(postmpi.colIdsT);
+
+  //free up space
+  recvNodes.free();
+  haloGatherNCounts.free();
+  haloGatherTCounts.free();
+
+  postmpi.setupRowBlocks();
+
+  sendCounts.malloc(size);
+  recvCounts.malloc(size);
+  sendOffsets.malloc(size+1);
+  recvOffsets.malloc(size+1);
+
+  sendOffsets[0]=0; //entry 0 is set once here; Start/Finish only rewrite entries 1..size
+  recvOffsets[0]=0;
+
+  //make scratch space, initially sized for one dfloat per entry
+  AllocBuffer(sizeof(dfloat));
+}
+
+void ogsAllToAll_t::AllocBuffer(size_t Nbytes) { //Nbytes: bytes per entry (callers pass k*sizeof(T)); buffers only ever grow
+  if (o_workspace.size() < postmpi.nnzT*Nbytes) {
+    h_workspace = platform.hostMalloc<char>(postmpi.nnzT*Nbytes); //fresh allocations: existing contents are not copied,
+    o_workspace = platform.malloc<char>(postmpi.nnzT*Nbytes);     //and views taken from the old workspace do not follow
+  }
+  if (o_sendspace.size() < NsendT*Nbytes) {
+    h_sendspace = platform.hostMalloc<char>(NsendT*Nbytes);
+    o_sendspace = platform.malloc<char>(NsendT*Nbytes);
+  }
+}
+
+} //namespace ogs
+
+} //namespace libp
diff --git a/libs/ogs/ogsAuto.cpp b/libs/ogs/ogsAuto.cpp
new file mode 100644
index 000000000..42868e969
--- /dev/null
+++ b/libs/ogs/ogsAuto.cpp
@@ -0,0 +1,349 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "ogs.hpp"
+#include "ogs/ogsUtils.hpp"
+#include "ogs/ogsOperator.hpp"
+#include "ogs/ogsExchange.hpp"
+#include "timer.hpp"
+
+namespace libp {
+
+namespace ogs {
+
+static void DeviceExchangeTest(ogsExchange_t* exchange, double time[3]) {
+  //Time one k=1 Add/Sym exchange of the halo buffer held in device memory.
+  //
+  // exchange - exchange to benchmark. Its own h_workspace/o_workspace are
+  //            used as the payload (with whatever values they currently
+  //            hold), and exchange->gpu_aware selects whether the device
+  //            buffer is exchanged directly or staged through the host.
+  // time     - output, elapsed time per exchange over the ranks of
+  //            exchange->comm: time[0]=average, time[1]=min, time[2]=max.
+  //
+  // NwarmUp untimed exchanges are issued before the Ntimed timed ones.
+  const int NwarmUp = 10;
+  const int Ntimed  = 10;
+
+  comm_t& comm = exchange->comm;
+  int Nranks = comm.size();
+
+  //the exchange's own scratch buffers, accessed as dfloat
+  pinnedMemory<dfloat> h_buf = exchange->h_workspace;
+  deviceMemory<dfloat> o_buf = exchange->o_workspace;
+
+  device_t &device = exchange->platform.device;
+
+  //one complete exchange of the device halo buffer
+  auto exchangeOnce = [&]() {
+    //path is re-read from exchange->gpu_aware on every call
+    if (exchange->gpu_aware) {
+      /*GPU-aware exchange*/
+      exchange->Start (o_buf, 1, Add, Sym);
+      exchange->Finish(o_buf, 1, Add, Sym);
+    } else {
+      //if not using gpu-aware mpi move the halo buffer to the host
+      o_buf.copyTo(h_buf, exchange->Nhalo,
+                   0, properties_t("async", true));
+      device.finish();
+
+      /*MPI exchange of host buffer*/
+      exchange->Start (h_buf, 1, Add, Sym);
+      exchange->Finish(h_buf, 1, Add, Sym);
+
+      // copy recv back to device
+      o_buf.copyFrom(h_buf, exchange->Nhalo,
+                     0, properties_t("async", true));
+      device.finish(); //wait for transfer to finish
+    }
+  };
+
+  //dry run
+  for (int n=0;n<NwarmUp;++n) {
+    exchangeOnce();
+  }
+
+  //hot runs
+  timePoint_t start = Time();
+  for (int n=0;n<Ntimed;++n) {
+    exchangeOnce();
+  }
+  timePoint_t end = Time();
+
+  //average over the timed runs, then reduce across ranks
+  double localTime = ElapsedTime(start,end)/Ntimed;
+  double sumTime, minTime, maxTime;
+  comm.Allreduce(localTime, sumTime, Comm::Sum);
+  comm.Allreduce(localTime, maxTime, Comm::Max);
+  comm.Allreduce(localTime, minTime, Comm::Min);
+
+  time[0] = sumTime/Nranks; //avg
+  time[1] = minTime;        //min
+  time[2] = maxTime;        //max
+}
+
+static void HostExchangeTest(ogsExchange_t* exchange, double time[3]) {
+  //Time one k=1 Add/Sym exchange of the exchange's pinned host workspace;
+  // time[] receives {average, min, max} over the ranks of exchange->comm.
+  const int NwarmUp = 10;
+  const int Ntimed  = 10;
+
+  comm_t& comm = exchange->comm;
+  int Nranks = comm.size();
+  pinnedMemory<dfloat> h_buf = exchange->h_workspace;
+
+  auto exchangeOnce = [&]() {
+    exchange->Start (h_buf, 1, Add, Sym);
+    exchange->Finish(h_buf, 1, Add, Sym);
+  };
+
+  //dry run
+  for (int n=0;n<NwarmUp;++n) exchangeOnce();
+
+  //hot runs
+  timePoint_t start = Time();
+  for (int n=0;n<Ntimed;++n) exchangeOnce();
+  timePoint_t end = Time();
+
+  double localTime = ElapsedTime(start,end)/Ntimed;
+  double sumTime, minTime, maxTime;
+  comm.Allreduce(localTime, sumTime, Comm::Sum);
+  comm.Allreduce(localTime, maxTime, Comm::Max);
+  comm.Allreduce(localTime, minTime, Comm::Min);
+
+  time[0] = sumTime/Nranks; //avg
+  time[1] = minTime;        //min
+  time[2] = maxTime;        //max
+}
+
+ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared,
+                                    memory<parallelNode_t> &sharedNodes,
+                                    ogsOperator_t& _gatherHalo,
+                                    comm_t _comm, //NOTE(review): never referenced below; the body reads 'comm'
+                                    platform_t &_platform, //NOTE(review): never referenced below; the body reads 'platform'
+                                    const int verbose) {
+  //Benchmark Pairwise, AllToAll and CrystalRouter and return the fastest (heap-allocated; not freed here).
+  int rank, size;
+  rank = comm.rank();
+  size = comm.size();
+  //a single rank has nothing to compare: return Pairwise without benchmarking
+  if (size==1) return new ogsPairwise_t(Nshared, sharedNodes,
+                                        _gatherHalo, dataStream,
+                                        comm, platform);
+
+  ogsExchange_t* bestExchange; //fastest exchange found so far
+  Method method;               //tag of bestExchange, read only by the report at the end
+  double bestTime;             //rank-averaged time of bestExchange
+
+#ifdef GPU_AWARE_MPI
+  if (rank==0 && verbose)
+    printf("   Method         Device Exchange (avg, min, max)  Device Exchange (GPU-aware)      Host Exchange \n");
+#else
+  if (rank==0 && verbose)
+    printf("   Method         Device Exchange (avg, min, max)  Host Exchange \n");
+#endif
+
+  //Trigger JIT kernel builds
+  InitializeKernels(platform, ogs::Dfloat, ogs::Add);
+
+  /********************************
+   * Pairwise
+   ********************************/
+  ogsExchange_t* pairwise = new ogsPairwise_t(Nshared, sharedNodes,
+                                              _gatherHalo, dataStream,
+                                              comm, platform);
+
+  //standard copy to host - exchange - copy back to device
+  pairwise->gpu_aware=false;
+
+  double pairwiseTime[3]; //{avg, min, max} as filled in by DeviceExchangeTest
+  DeviceExchangeTest(pairwise, pairwiseTime);
+  double pairwiseAvg = pairwiseTime[0]; //selection is based on the rank-averaged time
+
+#ifdef GPU_AWARE_MPI
+  //test GPU-aware exchange
+  pairwise->gpu_aware=true;
+
+  double pairwiseGATime[3];
+  DeviceExchangeTest(pairwise, pairwiseGATime);
+  //keep the GPU-aware path only if its average time is lower
+  if (pairwiseGATime[0] < pairwiseAvg)
+    pairwiseAvg = pairwiseGATime[0];
+  else
+    pairwise->gpu_aware=false;
+
+#endif
+
+  //test exchange from host memory (just for reporting)
+  double pairwiseHostTime[3];
+  HostExchangeTest(pairwise, pairwiseHostTime);
+  //Pairwise is the initial candidate the other methods are compared against
+  bestExchange = pairwise;
+  method = Pairwise;
+  bestTime = pairwiseAvg;
+
+#ifdef GPU_AWARE_MPI
+  if (rank==0 && verbose)
+    printf("   Pairwise       %5.3e %5.3e %5.3e    %5.3e %5.3e %5.3e    %5.3e %5.3e %5.3e \n",
+            pairwiseTime[0],     pairwiseTime[1],     pairwiseTime[2],
+            pairwiseGATime[0],   pairwiseGATime[1],   pairwiseGATime[2],
+            pairwiseHostTime[0], pairwiseHostTime[1], pairwiseHostTime[2]);
+#else
+  if (rank==0 && verbose)
+    printf("   Pairwise       %5.3e %5.3e %5.3e    %5.3e %5.3e %5.3e \n",
+            pairwiseTime[0],     pairwiseTime[1],     pairwiseTime[2],
+            pairwiseHostTime[0], pairwiseHostTime[1], pairwiseHostTime[2]);
+#endif
+
+  /********************************
+   * All-to-All
+   ********************************/
+  ogsExchange_t* alltoall = new ogsAllToAll_t(Nshared, sharedNodes,
+                                           _gatherHalo, dataStream,
+                                           comm, platform);
+  //standard copy to host - exchange - copy back to device
+  alltoall->gpu_aware=false;
+
+  double alltoallTime[3];
+  DeviceExchangeTest(alltoall, alltoallTime);
+  double alltoallAvg = alltoallTime[0];
+
+#ifdef GPU_AWARE_MPI
+  //test GPU-aware exchange
+  alltoall->gpu_aware=true;
+
+  double alltoallGATime[3];
+  DeviceExchangeTest(alltoall, alltoallGATime);
+
+  if (alltoallGATime[0] < alltoallAvg)
+    alltoallAvg = alltoallGATime[0];
+  else
+    alltoall->gpu_aware=false;
+
+#endif
+
+  //test exchange from host memory (just for reporting)
+  double alltoallHostTime[3];
+  HostExchangeTest(alltoall, alltoallHostTime);
+  //switch to AllToAll only if strictly faster; whichever exchange loses is deleted
+  if (alltoallAvg < bestTime) {
+    delete bestExchange;
+    bestExchange = alltoall;
+    method = AllToAll;
+    bestTime = alltoallAvg;
+  } else {
+    delete alltoall;
+  }
+
+#ifdef GPU_AWARE_MPI
+  if (rank==0 && verbose)
+    printf("   AllToAll       %5.3e %5.3e %5.3e    %5.3e %5.3e %5.3e    %5.3e %5.3e %5.3e \n",
+            alltoallTime[0],     alltoallTime[1],     alltoallTime[2],
+            alltoallGATime[0],   alltoallGATime[1],   alltoallGATime[2],
+            alltoallHostTime[0], alltoallHostTime[1], alltoallHostTime[2]);
+#else
+  if (rank==0 && verbose)
+    printf("   AllToAll       %5.3e %5.3e %5.3e    %5.3e %5.3e %5.3e \n",
+            alltoallTime[0],     alltoallTime[1],     alltoallTime[2],
+            alltoallHostTime[0], alltoallHostTime[1], alltoallHostTime[2]);
+#endif
+
+  /********************************
+   * Crystal Router
+   ********************************/
+  ogsExchange_t* crystal = new ogsCrystalRouter_t(Nshared, sharedNodes,
+                                                 _gatherHalo, dataStream,
+                                                 comm, platform);
+
+  //standard copy to host - exchange - copy back to device
+  crystal->gpu_aware=false;
+
+  double crystalTime[3];
+  DeviceExchangeTest(crystal, crystalTime);
+  double crystalAvg = crystalTime[0];
+
+#ifdef GPU_AWARE_MPI
+  //test GPU-aware exchange
+  crystal->gpu_aware=true;
+
+  double crystalGATime[3];
+  DeviceExchangeTest(crystal, crystalGATime);
+
+  if (crystalGATime[0] < crystalAvg)
+    crystalAvg = crystalGATime[0];
+  else
+    crystal->gpu_aware=false;
+
+#endif
+
+  //test exchange from host memory (just for reporting)
+  double crystalHostTime[3];
+  HostExchangeTest(crystal, crystalHostTime);
+  //switch to CrystalRouter only if strictly faster; whichever exchange loses is deleted
+  if (crystalAvg < bestTime) {
+    delete bestExchange;
+    bestExchange = crystal;
+    method = CrystalRouter;
+    bestTime = crystalAvg;
+  } else {
+    delete crystal;
+  }
+
+#ifdef GPU_AWARE_MPI
+  if (rank==0 && verbose)
+    printf("   CrystalRouter  %5.3e %5.3e %5.3e    %5.3e %5.3e %5.3e    %5.3e %5.3e %5.3e \n",
+            crystalTime[0],     crystalTime[1],     crystalTime[2],
+            crystalGATime[0],   crystalGATime[1],   crystalGATime[2],
+            crystalHostTime[0], crystalHostTime[1], crystalHostTime[2]);
+#else
+  if (rank==0 && verbose)
+    printf("   CrystalRouter  %5.3e %5.3e %5.3e    %5.3e %5.3e %5.3e \n",
+            crystalTime[0],     crystalTime[1],     crystalTime[2],
+            crystalHostTime[0], crystalHostTime[1], crystalHostTime[2]);
+#endif
+  //announce the selected method (rank 0 only, when verbose)
+  if (rank==0 && verbose) {
+    switch (method) {
+      case AllToAll:
+        printf("   Exchange method selected: AllToAll"); break;
+      case Pairwise:
+        printf("   Exchange method selected: Pairwise"); break;
+      case CrystalRouter:
+        printf("   Exchange method selected: CrystalRouter"); break;
+      default:
+        break;
+    }
+    if (bestExchange->gpu_aware) printf(" (GPU-aware)");
+    printf("\n");
+  }
+
+  return bestExchange;
+}
+
+
+} //namespace ogs
+
+} //namespace libp
diff --git a/libs/ogs/ogsCrystalRouter.cpp b/libs/ogs/ogsCrystalRouter.cpp
new file mode 100644
index 000000000..f62c836ea
--- /dev/null
+++ b/libs/ogs/ogsCrystalRouter.cpp
@@ -0,0 +1,775 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "ogs.hpp"
+#include "ogs/ogsUtils.hpp"
+#include "ogs/ogsExchange.hpp"
+
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::sort;
+#else
+using std::sort;
+#endif
+
+namespace libp {
+
+namespace ogs {
+
+/**********************************
+* Host exchange
+***********************************/
+template<typename T>
+inline void ogsCrystalRouter_t::Start(pinnedMemory<T> &buf, const int k,
+                               const Op op, const Transpose trans){} //empty: no communication is started here; the host Finish() performs the whole exchange
+
+template<typename T>
+inline void ogsCrystalRouter_t::Finish(pinnedMemory<T> &buf, const int k,
+                                const Op op, const Transpose trans){
+  //Host-buffer crystal-router exchange. For every level: post the receive(s), pack the entries
+  // named by sendIds, send them to the partner, then gather into the other work buffer.
+  memory<crLevel> levels; //level set matching the requested transpose mode
+  if (trans==NoTrans) {
+    levels = levelsN;
+  } else {
+    levels = levelsT;
+  }
+
+  pinnedMemory<T> sendBuf = h_sendspace;
+
+  // To start,    buf = h_workspace = h_work[(hbuf_id+0)%2];
+  //          sendBuf = h_sendspace;
+  for (int l=0;l<Nlevels;l++) {
+    pinnedMemory<T> recvBuf = h_workspace; //incoming entries are written at recvOffset*k of the current buffer
+
+    //post recvs
+    if (levels[l].Nmsg>0) {
+      comm.Irecv(recvBuf + levels[l].recvOffset*k,
+                 levels[l].partner,
+                 k*levels[l].Nrecv0,
+                 levels[l].partner,
+                 request[1]);
+    }
+    if (levels[l].Nmsg==2) { //second message comes from rank-1 and is placed right after the first
+      comm.Irecv(recvBuf + levels[l].recvOffset*k + levels[l].Nrecv0*k,
+                rank-1,
+                k*levels[l].Nrecv1,
+                rank-1,
+                request[2]);
+    }
+
+    //assemble send buffer
+    extract(levels[l].Nsend, k, levels[l].sendIds, buf, sendBuf);
+
+    //post send
+    comm.Isend(sendBuf, //posted unconditionally, also when Nmsg==0
+               levels[l].partner,
+               k*levels[l].Nsend,
+               rank,
+               request[0]);
+
+    comm.Waitall(levels[l].Nmsg+1, request); //request[0] is the send, request[1..Nmsg] the recvs
+
+    //rotate buffers
+    h_workspace = h_work[(hbuf_id+1)%2];
+    hbuf_id = (hbuf_id+1)%2;
+
+    recvBuf = buf;          //previous buffer (including the received entries) becomes the gather source
+    buf     = h_workspace;  //buf is a reference: the caller's handle is reassigned to the other work buffer
+
+    //Gather the recv'd values into the haloBuffer
+    levels[l].gather.Gather(buf, recvBuf, k, op, Trans); //NOTE(review): always passes Trans, not the 'trans' argument
+  }
+}
+
+void ogsCrystalRouter_t::Start(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); } //per-type overloads forwarding to the host templates above
+void ogsCrystalRouter_t::Start(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Start(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Start(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
+/**********************************
+* GPU-aware exchange
+***********************************/
+template<typename T>
+inline void ogsCrystalRouter_t::Start(deviceMemory<T> &o_buf,
+                                      const int k,
+                                      const Op op,
+                                      const Transpose trans){
+} //empty: no communication is started here; the device Finish() performs the whole exchange
+
+template<typename T>
+inline void ogsCrystalRouter_t::Finish(deviceMemory<T> &o_buf,
+                                       const int k,
+                                       const Op op,
+                                       const Transpose trans){
+  //Crystal-router exchange on device buffers: same per-level steps as the host version, with dataStream active.
+  device_t &device = platform.device;
+
+  //get current stream
+  stream_t currentStream = device.getStream();
+
+  //the intermediate kernels are always overlapped with the default stream
+  device.setStream(dataStream);
+
+  memory<crLevel> levels; //level set matching the requested transpose mode
+  if (trans==NoTrans) {
+    levels = levelsN;
+  } else {
+    levels = levelsT;
+  }
+
+  deviceMemory<T> o_sendBuf = o_sendspace;
+
+  // To start,    o_buf = o_workspace = o_work[(buf_id+0)%2];
+  //          o_sendBuf = o_sendspace
+  for (int l=0;l<Nlevels;l++) {
+    deviceMemory<T> o_recvBuf = o_workspace; //incoming entries are written at recvOffset*k of the current buffer
+
+    //post recvs
+    if (levels[l].Nmsg>0) {
+      comm.Irecv(o_recvBuf + levels[l].recvOffset*k,
+                 levels[l].partner,
+                 k*levels[l].Nrecv0,
+                 levels[l].partner,
+                 request[1]);
+    }
+    if (levels[l].Nmsg==2) { //second message comes from rank-1 and is placed right after the first
+      comm.Irecv(o_recvBuf + levels[l].recvOffset*k + levels[l].Nrecv0*k,
+                rank-1,
+                k*levels[l].Nrecv1,
+                rank-1,
+                request[2]);
+    }
+
+    //assemble send buffer
+    if (levels[l].Nsend) {
+      extractKernel[ogsType<T>::get()](levels[l].Nsend, k,
+                                       levels[l].o_sendIds,
+                                       o_buf, o_sendBuf);
+      device.finish(); //presumably so the packed data is complete before Isend reads o_sendBuf
+    }
+
+    //post send
+    comm.Isend(o_sendBuf, //posted unconditionally, also when Nsend==0 or Nmsg==0
+               levels[l].partner,
+               k*levels[l].Nsend,
+               rank,
+               request[0]);
+
+    comm.Waitall(levels[l].Nmsg+1, request); //request[0] is the send, request[1..Nmsg] the recvs
+
+    //rotate buffers
+    o_workspace = o_work[(buf_id+1)%2];
+    buf_id = (buf_id+1)%2;
+
+    o_recvBuf = o_buf;      //previous buffer (including the received entries) becomes the gather source
+    o_buf  = o_workspace;   //o_buf is a reference: the caller's handle is reassigned to the other work buffer
+
+    //Gather the recv'd values into the haloBuffer
+    levels[l].gather.Gather(o_buf, o_recvBuf, k, op, Trans); //NOTE(review): always passes Trans, not the 'trans' argument
+  }
+
+  device.setStream(currentStream); //restore the stream that was active on entry
+}
+
+void ogsCrystalRouter_t::Start(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); } //per-type overloads forwarding to the device templates above
+void ogsCrystalRouter_t::Start(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Start(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Start(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
+
+/*
+ * Crystal Router performs the needed MPI communication via recursive
+ * folding of a hypercube. Consider a set of NP ranks. We select a
+ * pivot point n_half=(NP+1)/2, and pair all ranks r<n_half (called
+ * lo half) with the ranks r>=n_half (called the hi half), as follows
+ *
+ *                0 <--> NP-1
+ *                1 <--> NP-2
+ *                2 <--> NP-3
+ *                  * * *
+ *         n_half-2 <--> NP-n_half+1
+ *         n_half-1 <--> NP-n_half
+ *
+ * The communication can then be summarized thusly: if a rank in the lo
+ * half has data needed by *any* rank in the hi half, it sends this data
+ * to its hi partner, and analogously for ranks in the hi half. Each rank
+ * therefore sends/receives a single message to/from its partner.
+ *
+ * The communication then proceeds recursively, applying the same folding
+ * procedure to the lo and hi halves separately, and stopping when the size
+ * of the local NP reaches 1.
+ *
+ * In the case where NP is odd, n_half-1 == NP-n_half and rank n_half-1 has
+ * no partner to communicate with. In this case, we assign rank n_half-1 to the
+ * lo half of ranks, and rank n_half-1 sends its data to rank n_half (and
+ * receives no message, as rank n_half-2 is receiving all rank n_half's data).
+
+ * To perform the Crystal Router exchange, each rank gathers its halo nodes to
+ * a coalesced buffer. At each step in the crystal router, a send buffer is
+ * gathered from this buffer and sent to the rank's partner. Simultaneously, a
+ * buffer is received from the rank's partner. This receive buffer is scattered
+ * and added into the coalesced halo buffer. After all communication is complete
+ * the halo nodes are scattered back to the output array.
+ */
+
+ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared,
+                                       memory<parallelNode_t> &sharedNodes,
+                                       ogsOperator_t& gatherHalo,
+                                       stream_t _dataStream,
+                                       comm_t _comm,
+                                       platform_t &_platform):
+  ogsExchange_t(_platform,_comm,_dataStream) {
+
+  NhaloP = gatherHalo.NrowsN;
+  Nhalo  = gatherHalo.NrowsT;
+
+  //first count how many levels we need
+  Nlevels = 0;
+  int np = size;
+  int np_offset=0;
+  while (np>1) {
+    int np_half = (np+1)/2;
+    int r_half = np_half + np_offset;
+
+    int is_lo = (rank<r_half) ? 1 : 0;
+
+    //Shrink the size of the hypercube
+    if (is_lo) {
+      np = np_half;
+    } else {
+      np -= np_half;
+      np_offset = r_half;
+    }
+    Nlevels++;
+  }
+  levelsN.malloc(Nlevels);
+  levelsT.malloc(Nlevels);
+
+  request.malloc(3);
+
+  //Now build the levels
+  Nlevels = 0;
+  np = size;
+  np_offset=0;
+
+  dlong N = Nshared + Nhalo;
+  memory<parallelNode_t> nodes(N);
+
+  //setup is easier if we include copies of the nodes we own
+  // in the list of shared nodes
+  for(dlong n=0;n<Nhalo;++n) {
+    nodes[n].newId = n;
+    nodes[n].sign  = (n<NhaloP) ? 2 : -2;
+    nodes[n].baseId = 0;
+    nodes[n].rank = rank;
+  }
+  for(dlong n=0;n<Nshared;++n) {
+    const dlong newId = sharedNodes[n].newId;
+    if (nodes[newId].baseId==0) {
+      if (newId<NhaloP)
+        nodes[newId].baseId = abs(sharedNodes[n].baseId);
+      else
+        nodes[newId].baseId = -abs(sharedNodes[n].baseId);
+    }
+  }
+  for(dlong n=Nhalo;n<N;++n) nodes[n] = sharedNodes[n-Nhalo];
+
+  sort(nodes.ptr(), nodes.ptr()+N,
+       [](const parallelNode_t& a, const parallelNode_t& b) {
+         return a.newId < b.newId; //group by newId (which also groups by abs(baseId))
+       });
+
+  dlong haloBuf_size = Nhalo;
+
+  dlong NhaloExtT = Nhalo;
+  dlong NhaloExtN = Nhalo;
+
+  while (np>1) {
+    int np_half = (np+1)/2;
+    int r_half = np_half + np_offset;
+
+    int is_lo = (rank<r_half) ? 1 : 0;
+
+    int partner = np-1-(rank-np_offset)+np_offset;
+    int Nmsg=1;
+    if (partner==rank) {
+      partner=r_half;
+      Nmsg=0;
+    }
+    if (np&1 && rank==r_half) {
+      Nmsg=2;
+    }
+    levelsN[Nlevels].partner = partner;
+    levelsT[Nlevels].partner = partner;
+    levelsN[Nlevels].Nmsg = Nmsg;
+    levelsT[Nlevels].Nmsg = Nmsg;
+
+    //count lo/hi nodes
+    dlong Nlo=0, Nhi=0;
+    for (dlong n=0;n<N;n++) {
+      if (nodes[n].rank<r_half)
+        Nlo++;
+      else
+        Nhi++;
+    }
+
+    int Nsend=(is_lo) ? Nhi : Nlo;
+
+    comm.Isend(Nsend, partner, rank, request[0]);
+
+    int Nrecv0=0, Nrecv1=0;
+    if (Nmsg>0)
+      comm.Irecv(Nrecv0, partner, partner, request[1]);
+    if (Nmsg==2)
+      comm.Irecv(Nrecv1, r_half-1, r_half-1, request[2]);
+
+    comm.Waitall(Nmsg+1, request);
+
+    int Nrecv = Nrecv0+Nrecv1;
+
+    //make room for the nodes we'll recv
+    if (is_lo) Nlo+=Nrecv;
+    else       Nhi+=Nrecv;
+
+    //split node list in two
+    memory<parallelNode_t> loNodes(Nlo);
+    memory<parallelNode_t> hiNodes(Nhi);
+
+    Nlo=0, Nhi=0;
+    for (dlong n=0;n<N;n++) {
+      if (nodes[n].rank<r_half)
+        loNodes[Nlo++] = nodes[n];
+      else
+        hiNodes[Nhi++] = nodes[n];
+    }
+
+    //free up space
+    nodes.free();
+
+    //point to the buffer we keep after the comms
+    nodes = is_lo ? loNodes : hiNodes;
+    N     = is_lo ? Nlo+Nrecv : Nhi+Nrecv;
+
+    const int offset = is_lo ? Nlo : Nhi;
+    memory<parallelNode_t> sendNodes = is_lo ? hiNodes : loNodes;
+
+    //count how many entries from the halo buffer we're sending
+    int NentriesSendN=0;
+    int NentriesSendT=0;
+    for (dlong n=0;n<Nsend;n++) {
+      if (n==0 || abs(sendNodes[n].baseId)!=abs(sendNodes[n-1].baseId)) {
+        if (sendNodes[n].sign>0) NentriesSendN++;
+        NentriesSendT++;
+      }
+    }
+    levelsN[Nlevels].Nsend = NentriesSendN;
+    levelsT[Nlevels].Nsend = NentriesSendT;
+    levelsN[Nlevels].sendIds.malloc(NentriesSendN);
+    levelsT[Nlevels].sendIds.malloc(NentriesSendT);
+
+    NentriesSendN=0; //reset
+    NentriesSendT=0; //reset
+    for (dlong n=0;n<Nsend;n++) {
+      if (n==0 || abs(sendNodes[n].baseId)!=abs(sendNodes[n-1].baseId)) {
+        if (sendNodes[n].sign>0)
+          levelsN[Nlevels].sendIds[NentriesSendN++] = sendNodes[n].newId;
+
+        levelsT[Nlevels].sendIds[NentriesSendT++] = sendNodes[n].newId;
+      }
+      sendNodes[n].newId = -1; //wipe the newId before sending
+    }
+    levelsT[Nlevels].o_sendIds = platform.malloc(levelsT[Nlevels].sendIds);
+    levelsN[Nlevels].o_sendIds = platform.malloc(levelsN[Nlevels].sendIds);
+
+    //share the entry count with our partner
+    comm.Isend(NentriesSendT, partner, rank, request[0]);
+
+    int NentriesRecvT0=0, NentriesRecvT1=0;
+    if (Nmsg>0)
+      comm.Irecv(NentriesRecvT0, partner, partner, request[1]);
+    if (Nmsg==2)
+      comm.Irecv(NentriesRecvT1, r_half-1, r_half-1, request[2]);
+
+    comm.Waitall(Nmsg+1, request);
+
+    levelsT[Nlevels].Nrecv0 = NentriesRecvT0;
+    levelsT[Nlevels].Nrecv1 = NentriesRecvT1;
+    levelsT[Nlevels].recvOffset = NhaloExtT;
+
+    comm.Isend(NentriesSendN, partner, rank, request[0]);
+
+    int NentriesRecvN0=0, NentriesRecvN1=0;
+    if (Nmsg>0)
+      comm.Irecv(NentriesRecvN0, partner, partner, request[1]);
+    if (Nmsg==2)
+      comm.Irecv(NentriesRecvN1, r_half-1, r_half-1, request[2]);
+
+    comm.Waitall(Nmsg+1, request);
+
+    levelsN[Nlevels].Nrecv0 = NentriesRecvN0;
+    levelsN[Nlevels].Nrecv1 = NentriesRecvN1;
+    levelsN[Nlevels].recvOffset = NhaloExtN;
+
+    //space needed in recv buffer for this level
+    dlong buf_size = NhaloExtT + NentriesRecvT0 + NentriesRecvT1;
+    haloBuf_size = (buf_size > haloBuf_size) ? buf_size : haloBuf_size;
+
+
+    //send half the list to our partner
+    comm.Isend(sendNodes, partner, Nsend, rank, request[0]);
+
+    //recv new nodes from our partner(s)
+    if (Nmsg>0)
+      comm.Irecv(nodes+offset, partner, Nrecv0, partner, request[1]);
+    if (Nmsg==2)
+      comm.Irecv(nodes+offset+Nrecv0, r_half-1, Nrecv1, r_half-1, request[2]);
+
+    comm.Waitall(Nmsg+1, request);
+
+    sendNodes.free();
+
+    //We now have a list of nodes who's destinations are in our half
+    // of the hypercube
+    //We now build the gather into the haloBuffer
+
+
+    //record the current order
+    for (dlong n=0;n<N;n++) nodes[n].localId = n;
+
+    //sort the new node list by baseId to find matches
+    sort(nodes.ptr(), nodes.ptr()+N,
+       [](const parallelNode_t& a, const parallelNode_t& b) {
+         if(abs(a.baseId) < abs(b.baseId)) return true; //group by abs(baseId)
+         if(abs(a.baseId) > abs(b.baseId)) return false;
+
+         return a.newId > b.newId; //positive newIds first
+       });
+
+    //find how many positive ids there will be in the extended halo
+    dlong start = 0;
+    NhaloExtN=0;
+    NhaloExtT=0;
+    for (dlong n=0;n<N;++n) {
+      //for each baseId group
+      if (n==N-1 || (abs(nodes[n].baseId)!=abs(nodes[n+1].baseId))) {
+        dlong end = n+1;
+        const dlong id = nodes[start].newId; //get Id
+
+        //if this id is in the extended halo already,
+        // or if it is a new baseId to arrive, look for
+        // a positive node
+        if (id >= Nhalo || id==-1) {
+          for (dlong i=start;i<end;++i) {
+            if (nodes[i].sign>0) {
+              NhaloExtN++;
+              break;
+            }
+          }
+          NhaloExtT++;
+        }
+        start = end;
+      }
+    }
+
+
+    //make an index map to save the original extended halo ids
+    memory<dlong> indexMap(NhaloExtT);
+
+    //fill newIds of new entries if possible, or give them an index
+    NhaloExtT = Nhalo + NhaloExtN;
+    NhaloExtN = Nhalo;
+    start = 0;
+    for (dlong n=0;n<N;++n) {
+      //for each baseId group
+      if (n==N-1 || (abs(nodes[n].baseId)!=abs(nodes[n+1].baseId))) {
+        dlong end = n+1;
+
+        dlong id = nodes[start].newId; //get Id
+
+        //if this id is in the extended halo already,
+        // or if it is a new baseId to arrive, give it
+        // a new id in the extended halo
+        if (id >= Nhalo || id==-1) {
+          int sign = -2;
+          for (dlong i=start;i<end;++i) {
+            if (nodes[i].sign>0) {
+              sign = nodes[i].sign;
+              break;
+            }
+          }
+
+          if (sign>0)
+            id = NhaloExtN++;
+          else
+            id = NhaloExtT++;
+
+          //save the orignal id
+          indexMap[id-Nhalo] = nodes[start].newId;
+        }
+
+        //write id into this baseId group
+        for (dlong i=start;i<end;++i)
+          nodes[i].newId = id;
+
+        start = end;
+      }
+    }
+
+    //sort back to first ordering
+    permute(N, nodes, [](const parallelNode_t& a) { return a.localId; } );
+
+    ogsOperator_t gatherN(platform);
+    ogsOperator_t gatherT(platform);
+
+    gatherN.kind = Unsigned;
+    gatherT.kind = Unsigned;
+
+    gatherN.NrowsN = NhaloExtN;
+    gatherN.NrowsT = NhaloExtN;
+    gatherN.Ncols  = levelsN[Nlevels].recvOffset
+                      + NentriesRecvN0 + NentriesRecvN1;
+
+    gatherT.NrowsN = NhaloExtT;
+    gatherT.NrowsT = NhaloExtT;
+    gatherT.Ncols  = levelsT[Nlevels].recvOffset
+                      + NentriesRecvT0 + NentriesRecvT1;
+
+    gatherT.rowStartsT.calloc(gatherT.NrowsT+1);
+    gatherT.rowStartsN = gatherT.rowStartsT;
+
+    gatherN.rowStartsT.calloc(gatherT.NrowsT+1);
+    gatherN.rowStartsN = gatherN.rowStartsT;
+
+    //gatherT the existing halo
+    for (dlong n=0;n<Nhalo;++n) gatherT.rowStartsT[n+1]=1;
+
+    //for notrans theres nothing to gather in the negative nodes the first time
+    if (np==size)
+      for (dlong n=0;n<NhaloP;++n) gatherN.rowStartsT[n+1]=1;
+    else
+      for (dlong n=0;n<Nhalo;++n) gatherN.rowStartsT[n+1]=1;
+
+    //look through the nodes we still have for extended halo nodes
+    for (dlong n=0;n<offset;++n) {
+      if (n==0 || abs(nodes[n].baseId)!=abs(nodes[n-1].baseId)) {
+        const dlong id = nodes[n].newId;
+        if (nodes[n].newId >= Nhalo) {
+          if (nodes[n].sign >0) gatherN.rowStartsT[id+1]++;
+          gatherT.rowStartsT[id+1]++;
+        }
+      }
+    }
+
+    //look through first message for nodes to gather
+    for (dlong n=offset;n<offset+Nrecv0;++n) {
+      if (n==offset || abs(nodes[n].baseId)!=abs(nodes[n-1].baseId)) {
+        const dlong id = nodes[n].newId;
+        if (nodes[n].sign >0) gatherN.rowStartsT[id+1]++;
+        gatherT.rowStartsT[id+1]++;
+      }
+    }
+    //look through second message for nodes to gather
+    for (dlong n=offset+Nrecv0;n<N;++n) {
+      if (n==offset+Nrecv0 || abs(nodes[n].baseId)!=abs(nodes[n-1].baseId)) {
+        const dlong id = nodes[n].newId;
+        if (nodes[n].sign >0) gatherN.rowStartsT[id+1]++;
+        gatherT.rowStartsT[id+1]++;
+      }
+    }
+
+    for (dlong i=0;i<gatherT.NrowsT;i++) {
+      gatherT.rowStartsT[i+1] += gatherT.rowStartsT[i];
+      gatherN.rowStartsT[i+1] += gatherN.rowStartsT[i];
+    }
+
+    gatherT.nnzT = gatherT.rowStartsT[gatherT.NrowsT];
+    gatherT.nnzN = gatherT.rowStartsT[gatherT.NrowsT];
+
+    gatherT.colIdsT.calloc(gatherT.nnzT);
+    gatherT.colIdsN = gatherT.colIdsT;
+
+    gatherN.nnzT = gatherN.rowStartsT[gatherN.NrowsT];
+    gatherN.nnzN = gatherN.rowStartsT[gatherN.NrowsT];
+
+    gatherN.colIdsT.calloc(gatherN.nnzT);
+    gatherN.colIdsN = gatherN.colIdsT;
+
+    //gatherT the existing halo
+    for (dlong n=0;n<Nhalo;++n) {
+      gatherT.colIdsT[gatherT.rowStartsT[n]++] = n;
+    }
+
+    if (np==size) {
+      for (dlong n=0;n<NhaloP;++n) {
+        gatherN.colIdsT[gatherN.rowStartsT[n]++] = n;
+      }
+    } else {
+      for (dlong n=0;n<Nhalo;++n) {
+        gatherN.colIdsT[gatherN.rowStartsT[n]++] = n;
+      }
+    }
+
+    //look through the nodes we still have for extended halo nodes
+    for (dlong n=0;n<offset;++n) {
+      if (n==0 || abs(nodes[n].baseId)!=abs(nodes[n-1].baseId)) {
+        const dlong id = nodes[n].newId;
+        if (nodes[n].newId >= Nhalo) {
+          if (nodes[n].sign > 0) {
+            gatherN.colIdsT[gatherN.rowStartsT[id]++] = indexMap[id-Nhalo];
+          }
+          gatherT.colIdsT[gatherT.rowStartsT[id]++] = indexMap[id-Nhalo];
+        }
+      }
+    }
+
+    indexMap.free();
+
+    dlong NentriesRecvN=levelsN[Nlevels].recvOffset;
+    dlong NentriesRecvT=levelsT[Nlevels].recvOffset;
+    //look through first message for nodes to gatherT
+    for (dlong n=offset;n<offset+Nrecv0;++n) {
+      if (n==offset || abs(nodes[n].baseId)!=abs(nodes[n-1].baseId)) {
+        const dlong id = nodes[n].newId;
+        if (nodes[n].sign > 0) {
+          gatherN.colIdsT[gatherN.rowStartsT[id]++] = NentriesRecvN++;
+        }
+        gatherT.colIdsT[gatherT.rowStartsT[id]++] = NentriesRecvT++;
+      }
+    }
+    //look through second message for nodes to gatherT
+    for (dlong n=offset+Nrecv0;n<N;++n) {
+      if (n==offset+Nrecv0 || abs(nodes[n].baseId)!=abs(nodes[n-1].baseId)) {
+        const dlong id = nodes[n].newId;
+        if (nodes[n].sign > 0) {
+          gatherN.colIdsT[gatherN.rowStartsT[id]++] = NentriesRecvN++;
+        }
+        gatherT.colIdsT[gatherT.rowStartsT[id]++] = NentriesRecvT++;
+      }
+    }
+
+    //reset row starts
+    for (dlong i=gatherT.NrowsT;i>0;--i) {
+      gatherT.rowStartsT[i] = gatherT.rowStartsT[i-1];
+      gatherN.rowStartsT[i] = gatherN.rowStartsT[i-1];
+    }
+    gatherT.rowStartsT[0] = 0;
+    gatherN.rowStartsT[0] = 0;
+
+    gatherT.o_rowStartsT = platform.malloc(gatherT.rowStartsT);
+    gatherT.o_rowStartsN = gatherT.o_rowStartsT;
+    gatherN.o_rowStartsT = platform.malloc(gatherN.rowStartsT);
+    gatherN.o_rowStartsN = gatherN.o_rowStartsT;
+    gatherT.o_colIdsT = platform.malloc(gatherT.colIdsT);
+    gatherT.o_colIdsN = gatherT.o_colIdsT;
+    gatherN.o_colIdsT = platform.malloc(gatherN.colIdsT);
+    gatherN.o_colIdsN = gatherN.o_colIdsT;
+
+    gatherN.setupRowBlocks();
+    gatherT.setupRowBlocks();
+
+    levelsT[Nlevels].gather = gatherT;
+    levelsN[Nlevels].gather = gatherN;
+
+    //sort the new node list by newId
+    sort(nodes.ptr(), nodes.ptr()+N,
+       [](const parallelNode_t& a, const parallelNode_t& b) {
+         return a.newId < b.newId; //group by newId (which also groups by abs(baseId))
+       });
+
+    //propagate the sign of received nodes
+    start = 0;
+    for (dlong n=0;n<N;++n) {
+      //for each baseId group
+      if (n==N-1 || (abs(nodes[n].baseId)!=abs(nodes[n+1].baseId))) {
+        dlong end = n+1;
+        //look for a positive sign, so we know if this node flips positive
+        for (dlong i=start;i<end;++i) {
+          const int sign = nodes[i].sign;
+          if (sign>0) {
+            for (dlong j=start;j<end;++j)
+              nodes[j].sign = sign;
+            break;
+          }
+        }
+        start = end;
+      }
+    }
+
+    //Shrink the size of the hypercube
+    if (is_lo) {
+      np = np_half;
+    } else {
+      np -= np_half;
+      np_offset = r_half;
+    }
+    Nlevels++;
+  }
+  if (size>1) nodes.free();
+
+  NsendMax=0, NrecvMax=0;
+  for (int k=0;k<Nlevels;k++) {
+    int Nsend = levelsT[k].Nsend;
+    NsendMax = (Nsend>NsendMax) ? Nsend : NsendMax;
+    int Nrecv = levelsT[k].recvOffset
+                + levelsT[k].Nrecv0 + levelsT[k].Nrecv1;
+    NrecvMax = (Nrecv>NrecvMax) ? Nrecv : NrecvMax;
+  }
+
+  //make scratch space
+  AllocBuffer(sizeof(dfloat));
+}
+
+void ogsCrystalRouter_t::AllocBuffer(size_t Nbytes) {
+  // Ensure the send/work scratch buffers can hold NsendMax/NrecvMax entries of Nbytes bytes each.
+  if (o_sendspace.size() < NsendMax*Nbytes) { // reallocate only when the current device send buffer is too small
+    h_sendspace = platform.hostMalloc<char>(NsendMax*Nbytes);
+    o_sendspace = platform.malloc<char>(NsendMax*Nbytes);
+  }
+  if (o_work[0].size() < NrecvMax*Nbytes) { // the work buffers below are always reallocated together
+    h_work[0] = platform.hostMalloc<char>(NrecvMax*Nbytes);
+    h_work[1] = platform.hostMalloc<char>(NrecvMax*Nbytes);
+    h_workspace = h_work[0];
+    hbuf_id=0; // h_workspace now refers to h_work[0]
+    // device-side pair of work buffers, same size as the host pair
+    o_work[0] = platform.malloc<char>(NrecvMax*Nbytes);
+    o_work[1] = platform.malloc<char>(NrecvMax*Nbytes);
+    o_workspace = o_work[0];
+    buf_id=0; // o_workspace now refers to o_work[0]
+  }
+}
+
+} //namespace ogs
+
+} //namespace libp
diff --git a/libs/ogs/ogsHalo.cpp b/libs/ogs/ogsHalo.cpp
new file mode 100644
index 000000000..7e9743531
--- /dev/null
+++ b/libs/ogs/ogsHalo.cpp
@@ -0,0 +1,395 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "ogs.hpp"
+#include "ogs/ogsUtils.hpp"
+#include "ogs/ogsOperator.hpp"
+#include "ogs/ogsExchange.hpp"
+
+namespace libp {
+
+namespace ogs {
+
+/********************************
+ * Device Exchange
+ ********************************/
+template<typename T>
+void halo_t::Exchange(deviceMemory<T> o_v, const int k) { // blocking device halo exchange: start, then finish
+  ExchangeStart (o_v, k);
+  ExchangeFinish(o_v, k);
+}
+
+template<typename T>
+void halo_t::ExchangeStart(deviceMemory<T> o_v, const int k){ // k scales every node count/offset below (presumably values per node - confirm)
+  exchange->AllocBuffer(k*sizeof(T)); // size the scratch buffers for k values of type T per entry
+  // typed handle onto the exchange's device workspace
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
+  if (exchange->gpu_aware) {
+    if (gathered_halo) {
+      //if this halo was built from a gathered ogs the halo nodes are at the end
+      o_haloBuf.copyFrom(o_v + k*NlocalT, k*NhaloP,
+                         0, properties_t("async", true));
+    } else {
+      //collect halo buffer
+      gatherHalo->Gather(o_haloBuf, o_v, k, Add, NoTrans);
+    }
+
+    //prepare MPI exchange
+    exchange->Start(o_haloBuf, k, Add, NoTrans);
+
+  } else { // host-staged path: only the device-to-host copy is queued here; the MPI calls are issued in ExchangeFinish
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
+
+    //if not using gpu-aware mpi move the halo buffer to the host
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    if (gathered_halo) {
+      //wait for o_v to be ready
+      device.finish();
+
+      //queue copy to host
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_v + k*NlocalT, NhaloP*k,
+                       0, properties_t("async", true));
+      device.setStream(currentStream); // restore the caller's stream
+    } else {
+      //collect halo buffer
+      gatherHalo->Gather(o_haloBuf, o_v, k, Add, NoTrans);
+
+      //wait for o_haloBuf to be ready
+      device.finish();
+
+      //queue copy to host
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_haloBuf, NhaloP*k,
+                       0, properties_t("async", true));
+      device.setStream(currentStream); // restore the caller's stream
+    }
+  }
+}
+
+template<typename T>
+void halo_t::ExchangeFinish(deviceMemory<T> o_v, const int k){
+  // Complete the exchange begun by ExchangeStart and write the received halo values into o_v.
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
+  //write exchanged halo buffer back to vector
+  if (exchange->gpu_aware) {
+    //finish MPI exchange
+    exchange->Finish(o_haloBuf, k, Add, NoTrans);
+
+    if (gathered_halo) {
+      o_haloBuf.copyTo(o_v + k*(NlocalT+NhaloP), k*Nhalo,
+                       k*NhaloP, properties_t("async", true));
+    } else {
+      gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans);
+    }
+  } else { // host-staged path: the whole MPI exchange (Start and Finish) happens here on the host buffer
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
+
+    //synchronize data stream to ensure the buffer is on the host
+    device.setStream(dataStream);
+    device.finish();
+
+    /*MPI exchange of host buffer*/
+    exchange->Start (haloBuf, k, Add, NoTrans);
+    exchange->Finish(haloBuf, k, Add, NoTrans);
+
+    // copy recv back to device
+    if (gathered_halo) {
+      haloBuf.copyTo(o_v + k*(NlocalT+NhaloP), k*Nhalo,
+                     k*NhaloP, properties_t("async", true));
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream); // restore the caller's stream
+    } else {
+      haloBuf.copyTo(o_haloBuf+k*NhaloP, k*Nhalo,
+                     k*NhaloP, properties_t("async", true));
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream); // restore the caller's stream before launching the scatter
+
+      gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans);
+    }
+  }
+}
+
+template void halo_t::ExchangeStart(deviceMemory<float> o_v, const int k); // explicit instantiations: device Exchange* for float, double, int, long long int
+template void halo_t::ExchangeStart(deviceMemory<double> o_v, const int k);
+template void halo_t::ExchangeStart(deviceMemory<int> o_v, const int k);
+template void halo_t::ExchangeStart(deviceMemory<long long int> o_v, const int k);
+template void halo_t::ExchangeFinish(deviceMemory<float> o_v, const int k);
+template void halo_t::ExchangeFinish(deviceMemory<double> o_v, const int k);
+template void halo_t::ExchangeFinish(deviceMemory<int> o_v, const int k);
+template void halo_t::ExchangeFinish(deviceMemory<long long int> o_v, const int k);
+template void halo_t::Exchange(deviceMemory<float> o_v, const int k);
+template void halo_t::Exchange(deviceMemory<double> o_v, const int k);
+template void halo_t::Exchange(deviceMemory<int> o_v, const int k);
+template void halo_t::Exchange(deviceMemory<long long int> o_v, const int k);
+
+//host version
+template<typename T>
+void halo_t::Exchange(memory<T> v, const int k) { // blocking host halo exchange: start, then finish
+  ExchangeStart (v, k);
+  ExchangeFinish(v, k);
+}
+
+template<typename T>
+void halo_t::ExchangeStart(memory<T> v, const int k) { // k scales every node count/offset below (presumably values per node - confirm)
+  exchange->AllocBuffer(k*sizeof(T)); // size the scratch buffers for k values of type T per entry
+  // typed handle onto the exchange's host workspace
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+  //collect halo buffer
+  if (gathered_halo) {
+    //if this halo was built from a gathered ogs the halo nodes are at the end
+    haloBuf.copyFrom(v + k*NlocalT, k*NhaloP);
+  } else {
+    gatherHalo->Gather(haloBuf, v, k, Add, NoTrans);
+  }
+
+  //Prepare MPI exchange
+  exchange->Start(haloBuf, k, Add, NoTrans);
+}
+
+template<typename T>
+void halo_t::ExchangeFinish(memory<T> v, const int k) {
+  // Complete the exchange begun by ExchangeStart and write the received halo values into v.
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+  //finish MPI exchange
+  exchange->Finish(haloBuf, k, Add, NoTrans);
+
+  //write exchanged halo buffer back to vector
+  if (gathered_halo) {
+    //if this halo was built from a gathered ogs the halo nodes are at the end
+    haloBuf.copyTo(v + k*(NlocalT+NhaloP),
+                   k*Nhalo,
+                   k*NhaloP);
+  } else {
+    gatherHalo->Scatter(v, haloBuf, k, NoTrans);
+  }
+}
+
+template void halo_t::ExchangeStart(memory<float> v, const int k); // explicit instantiations: host Exchange* for float, double, int, long long int
+template void halo_t::ExchangeStart(memory<double> v, const int k);
+template void halo_t::ExchangeStart(memory<int> v, const int k);
+template void halo_t::ExchangeStart(memory<long long int> v, const int k);
+template void halo_t::ExchangeFinish(memory<float> v, const int k);
+template void halo_t::ExchangeFinish(memory<double> v, const int k);
+template void halo_t::ExchangeFinish(memory<int> v, const int k);
+template void halo_t::ExchangeFinish(memory<long long int> v, const int k);
+template void halo_t::Exchange(memory<float> v, const int k);
+template void halo_t::Exchange(memory<double> v, const int k);
+template void halo_t::Exchange(memory<int> v, const int k);
+template void halo_t::Exchange(memory<long long int> v, const int k);
+
+/********************************
+ * Combine
+ ********************************/
+template<typename T>
+void halo_t::Combine(deviceMemory<T> o_v, const int k) { // blocking device halo combine: start, then finish
+  CombineStart (o_v, k);
+  CombineFinish(o_v, k);
+}
+
+template<typename T>
+void halo_t::CombineStart(deviceMemory<T> o_v, const int k){ // k scales every node count/offset below (presumably values per node - confirm)
+  exchange->AllocBuffer(k*sizeof(T)); // size the scratch buffers for k values of type T per entry
+  // typed handle onto the exchange's device workspace
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
+  if (exchange->gpu_aware) {
+    if (gathered_halo) {
+      //if this halo was built from a gathered ogs the halo nodes are at the end
+      o_haloBuf.copyFrom(o_v + k*NlocalT, k*NhaloT,
+                         0, properties_t("async", true));
+    } else {
+      //collect halo buffer
+      gatherHalo->Gather(o_haloBuf, o_v, k, Add, Trans);
+    }
+
+    //prepare MPI exchange
+    exchange->Start(o_haloBuf, k, Add, Trans);
+  } else { // host-staged path: only the device-to-host copy is queued here; the MPI calls are issued in CombineFinish
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
+
+    //if not using gpu-aware mpi move the halo buffer to the host
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    if (gathered_halo) {
+      //wait for o_v to be ready
+      device.finish();
+
+      //queue copy to host
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_v + k*NlocalT, NhaloT*k,
+                       0, properties_t("async", true));
+      device.setStream(currentStream); // restore the caller's stream
+    } else {
+      //collect halo buffer
+      gatherHalo->Gather(o_haloBuf, o_v, k, Add, Trans);
+
+      //wait for o_haloBuf to be ready
+      device.finish();
+
+      //queue copy to host
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_haloBuf, NhaloT*k,
+                       0, properties_t("async", true));
+      device.setStream(currentStream); // restore the caller's stream
+    }
+  }
+}
+
+template<typename T>
+void halo_t::CombineFinish(deviceMemory<T> o_v, const int k){
+  // Complete the combine begun by CombineStart and write the combined values back into o_v.
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
+  //write exchanged halo buffer back to vector
+  if (exchange->gpu_aware) {
+    //finish MPI exchange
+    exchange->Finish(o_haloBuf, k, Add, Trans);
+
+    if (gathered_halo) {
+      //if this halo was built from a gathered ogs the halo nodes are at the end
+      o_haloBuf.copyTo(o_v + k*NlocalT, k*NhaloP,
+                       0, properties_t("async", true));
+    } else {
+      gatherHalo->Scatter(o_v, o_haloBuf, k, Trans);
+    }
+  } else { // host-staged path: the whole MPI exchange (Start and Finish) happens here on the host buffer
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
+
+    //synchronize data stream to ensure the buffer is on the host
+    device.setStream(dataStream);
+    device.finish();
+
+    /*MPI exchange of host buffer*/
+    exchange->Start (haloBuf, k, Add, Trans);
+    exchange->Finish(haloBuf, k, Add, Trans);
+
+    if (gathered_halo) {
+      // copy recv back to device
+      haloBuf.copyTo(o_v + k*NlocalT, NhaloP*k,
+                     0, properties_t("async", true));
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream); // restore the caller's stream
+    } else {
+      haloBuf.copyTo(o_haloBuf, NhaloP*k,
+                     0, properties_t("async", true));
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream); // restore the caller's stream before launching the scatter
+
+      gatherHalo->Scatter(o_v, o_haloBuf, k, Trans);
+    }
+  }
+}
+
+template void halo_t::CombineStart(deviceMemory<float> o_v, const int k); // explicit instantiations: device Combine* for float, double, int, long long int
+template void halo_t::CombineStart(deviceMemory<double> o_v, const int k);
+template void halo_t::CombineStart(deviceMemory<int> o_v, const int k);
+template void halo_t::CombineStart(deviceMemory<long long int> o_v, const int k);
+template void halo_t::CombineFinish(deviceMemory<float> o_v, const int k);
+template void halo_t::CombineFinish(deviceMemory<double> o_v, const int k);
+template void halo_t::CombineFinish(deviceMemory<int> o_v, const int k);
+template void halo_t::CombineFinish(deviceMemory<long long int> o_v, const int k);
+template void halo_t::Combine(deviceMemory<float> o_v, const int k);
+template void halo_t::Combine(deviceMemory<double> o_v, const int k);
+template void halo_t::Combine(deviceMemory<int> o_v, const int k);
+template void halo_t::Combine(deviceMemory<long long int> o_v, const int k);
+
+//host version
+template<typename T>
+void halo_t::Combine(memory<T> v, const int k) { // blocking host halo combine: start, then finish
+  CombineStart (v, k);
+  CombineFinish(v, k);
+}
+
+template<typename T>
+void halo_t::CombineStart(memory<T> v, const int k) { // k scales every node count/offset below (presumably values per node - confirm)
+  exchange->AllocBuffer(k*sizeof(T)); // size the scratch buffers for k values of type T per entry
+  // typed handle onto the exchange's host workspace
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+  //collect halo buffer
+  if (gathered_halo) {
+    //if this halo was built from a gathered ogs the halo nodes are at the end
+    haloBuf.copyFrom(v + k*NlocalT, k*NhaloT);
+  } else {
+    gatherHalo->Gather(haloBuf, v, k, Add, Trans);
+  }
+
+  //Prepare MPI exchange
+  exchange->Start(haloBuf, k, Add, Trans);
+}
+
+
+template<typename T>
+void halo_t::CombineFinish(memory<T> v, const int k) {
+  // Complete the combine begun by CombineStart and write the combined values back into v.
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+  //finish MPI exchange
+  exchange->Finish(haloBuf, k, Add, Trans);
+
+  //write exchanged halo buffer back to vector
+  if (gathered_halo) {
+    //if this halo was built from a gathered ogs the halo nodes are at the end
+    haloBuf.copyTo(v + k*NlocalT, k*NhaloP);
+  } else {
+    gatherHalo->Scatter(v, haloBuf, k, Trans);
+  }
+}
+
+template void halo_t::CombineStart(memory<float> v, const int k); // explicit instantiations: host Combine* for float, double, int, long long int
+template void halo_t::CombineStart(memory<double> v, const int k);
+template void halo_t::CombineStart(memory<int> v, const int k);
+template void halo_t::CombineStart(memory<long long int> v, const int k);
+template void halo_t::CombineFinish(memory<float> v, const int k);
+template void halo_t::CombineFinish(memory<double> v, const int k);
+template void halo_t::CombineFinish(memory<int> v, const int k);
+template void halo_t::CombineFinish(memory<long long int> v, const int k);
+template void halo_t::Combine(memory<float> v, const int k);
+template void halo_t::Combine(memory<double> v, const int k);
+template void halo_t::Combine(memory<int> v, const int k);
+template void halo_t::Combine(memory<long long int> v, const int k);
+
+} //namespace ogs
+
+} //namespace libp
diff --git a/libs/ogs/ogsKernels.cpp b/libs/ogs/ogsKernels.cpp
deleted file mode 100644
index de33ecd45..000000000
--- a/libs/ogs/ogsKernels.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-
-#include "ogs.hpp"
-#include "ogs/ogsKernels.hpp"
-
-//convert a macro command into a string
-#define _STR(x) #x
-#define STR(x) _STR(x)
-
-namespace ogs {
-
-  //NC: Hard code these for now. Should be sufficient for GPU devices, but needs attention for CPU
-  const int blockSize = 256;
-  const int gatherNodesPerBlock = 1024; //should be a multiple of blockSize for good unrolling
-
-  int Nrefs = 0;
-
-  occa::stream dataStream;
-
-#define DEFINE_GATHERSCATTER_KERNEL(T,OP) \
-  occa::kernel gatherScatterKernel_##T##_##OP;
-
-#define DEFINE_GATHER_KERNEL(T,OP) \
-  occa::kernel gatherKernel_##T##_##OP;
-
-#define DEFINE_SCATTER_KERNEL(T) \
-  occa::kernel scatterKernel_##T;
-
-#define DEFINE_KERNELS(T)                        \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHERSCATTER_KERNEL) \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHER_KERNEL)        \
-  DEFINE_SCATTER_KERNEL(T)
-
-OGS_FOR_EACH_TYPE(DEFINE_KERNELS)
-
-
-void initKernels(platform_t& platform) {
-
-  int rank = platform.rank;
-
-  dataStream = platform.device.createStream();
-
-  occa::properties kernelInfo = platform.props;
-
-  kernelInfo["defines/p_blockSize"] = blockSize;
-  kernelInfo["defines/p_gatherNodesPerBlock"] = gatherNodesPerBlock;
-
-#define DEFINE_OCCA_ADD_INIT(T) \
-  kernelInfo["defines/init_" STR(T) "_add"] = (T)  0;                              \
-  kernelInfo["defines/init_" STR(T) "_mul"] = (T)  1;                              \
-  kernelInfo["defines/init_" STR(T) "_min"] = (T)  std::numeric_limits<T>::max(); \
-  kernelInfo["defines/init_" STR(T) "_max"] = (T) -std::numeric_limits<T>::max();
-
-//OCCA properties don't have an operator+ for long long int, so alias it to int64_t
-typedef int64_t long_long;
-  OGS_FOR_EACH_TYPE(DEFINE_OCCA_ADD_INIT)
-
-  kernelInfo["includes"] += LIBP_DIR "/include/ogs/ogsDefs.h";
-
-  if (rank==0) {printf("Compiling GatherScatter Kernels...");fflush(stdout);}
-
-#define DEFINE_GATHERSCATTER_BUILD(T,OP)                                           \
-  gatherScatterKernel_##T##_##OP = platform.buildKernel(OGS_DIR "/okl/gatherScatter.okl",\
-                                             "gatherScatter_" STR(T) "_" STR(OP),  \
-                                             kernelInfo);                          \
-
-#define DEFINE_GATHER_BUILD(T,OP)                                                  \
-  gatherKernel_##T##_##OP = platform.buildKernel(OGS_DIR "/okl/gatherScatter.okl",    \
-                                             "gather_" STR(T) "_" STR(OP),         \
-                                             kernelInfo);                          \
-
-#define DEFINE_SCATTER_BUILD(T)                                                    \
-  scatterKernel_##T      = platform.buildKernel(OGS_DIR "/okl/gatherScatter.okl",     \
-                                             "scatter_" STR(T),                    \
-                                             kernelInfo);                          \
-
-#define DEFINE_BUILD(T)                         \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHERSCATTER_BUILD) \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHER_BUILD)        \
-  DEFINE_SCATTER_BUILD(T)
-
-  OGS_FOR_EACH_TYPE(DEFINE_BUILD)
-
-  if(rank==0) printf("done.\n");
-}
-
-void freeKernels() {
-
-#define DEFINE_GATHERSCATTER_FREE(T,OP)      \
-  gatherScatterKernel_##T##_##OP.free();
-
-#define DEFINE_GATHER_FREE(T,OP)      \
-  gatherKernel_##T##_##OP.free();
-
-#define DEFINE_SCATTER_FREE(T)       \
-  scatterKernel_##T.free();
-
-#define DEFINE_FREE(T)                         \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHERSCATTER_FREE) \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHER_FREE)        \
-  DEFINE_SCATTER_FREE(T)
-
-  OGS_FOR_EACH_TYPE(DEFINE_FREE)
-}
-
-} //namespace ogs
-
diff --git a/libs/ogs/ogsOperator.cpp b/libs/ogs/ogsOperator.cpp
new file mode 100644
index 000000000..19a496060
--- /dev/null
+++ b/libs/ogs/ogsOperator.cpp
@@ -0,0 +1,635 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include <limits>
+#include "ogs.hpp"
+#include "ogs/ogsUtils.hpp"
+#include "ogs/ogsOperator.hpp"
+
+namespace libp {
+
+namespace ogs {
+
+template<typename T>
+struct Op_Add { // sum reduction functor: init() is the starting value, operator() folds v into gv
+  inline const T init() const { return T{0}; }
+  inline void operator()(T& gv, const T v) const { gv += v; }
+};
+template<typename T>
+struct Op_Mul { // product reduction functor: init() is the starting value, operator() folds v into gv
+  inline const T init() const { return T{1}; }
+  inline void operator()(T& gv, const T v) const { gv *= v; }
+};
+template<typename T>
+struct Op_Max { // maximum reduction functor: init() is the starting value, operator() keeps the larger of gv and v
+  inline const T init() const { return -std::numeric_limits<T>::max(); } // NOTE(review): for signed integer T this is lowest()+1, not the smallest value - confirm intended and matches the device kernels
+  inline void operator()(T& gv, const T v) const { gv = (v>gv) ? v : gv; }
+};
+template<typename T>
+struct Op_Min { // minimum reduction functor: init() is the starting value, operator() keeps the smaller of gv and v
+  inline const T init() const {return  std::numeric_limits<T>::max(); }
+  inline void operator()(T& gv, const T v) const { gv = (v<gv) ? v : gv; }
+};
+
+/********************************
+ * Gather Operation
+ ********************************/
+template <template<typename> class U,
+          template<typename> class V,
+          template<typename> class Op,
+          typename T>
+void ogsOperator_t::Gather(U<T> gv,
+                           const V<T> v,
+                           const int K,
+                           const Transpose trans) {
+  // Host gather: for each row n, reduce the entries of v listed in colIds[rowStarts[n]..rowStarts[n+1]) with Op into gv[n].
+  dlong Nrows;
+  dlong *__restrict__ rowStarts, *__restrict__ colIds;
+  if (trans==NoTrans) { // NoTrans uses the 'N' row/column arrays, anything else the 'T' arrays
+    Nrows = NrowsN;
+    rowStarts = rowStartsN.ptr();
+    colIds = colIdsN.ptr();
+  } else {
+    Nrows = NrowsT;
+    rowStarts = rowStartsT.ptr();
+    colIds = colIdsT.ptr();
+  }
+
+  const T*__restrict__ v_ptr  = v.ptr();
+  T*__restrict__ gv_ptr = gv.ptr();
+
+  const Op<T> op;
+
+  if (K==1) { // one value per entry: plain indexing
+    #pragma omp parallel for
+    for(dlong n=0;n<Nrows;++n){
+      const dlong start = rowStarts[n];
+      const dlong end   = rowStarts[n+1];
+
+      T val = op.init(); // a row with no entries yields op.init()
+      for(dlong g=start;g<end;++g){
+        op(val, v_ptr[colIds[g]]);
+      }
+      gv_ptr[n] = val;
+    }
+  } else { // K values per entry, stored contiguously: value k of entry i is at index k+i*K
+    #pragma omp parallel for
+    for(dlong n=0;n<Nrows;++n){
+      const dlong start = rowStarts[n];
+      const dlong end   = rowStarts[n+1];
+
+      for (int k=0;k<K;++k) {
+        T val = op.init();
+        for(dlong g=start;g<end;++g){
+          op(val, v_ptr[k+colIds[g]*K]);
+        }
+        gv_ptr[k+n*K] = val;
+      }
+    }
+  }
+}
+
+template <template<typename> class U,
+          template<typename> class V,
+          typename T>
+void ogsOperator_t::Gather(U<T> gv,
+                           const V<T> v,
+                           const int k,
+                           const Op op,
+                           const Transpose trans) {
+  switch (op){ // map the runtime op onto the matching functor-templated Gather; any other op value does nothing
+    case Add:
+      Gather<U, V, Op_Add, T>(gv, v, k, trans); break;
+    case Mul:
+      Gather<U, V, Op_Mul, T>(gv, v, k, trans); break;
+    case Max:
+      Gather<U, V, Op_Max, T>(gv, v, k, trans); break;
+    case Min:
+      Gather<U, V, Op_Min, T>(gv, v, k, trans); break;
+  }
+}
+
+template // explicit instantiations of the host Gather: memory->memory, memory->pinnedMemory, pinnedMemory->pinnedMemory
+void ogsOperator_t::Gather(memory<float> gv, const memory<float> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(memory<double> gv, const memory<double> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(memory<int> gv, const memory<int> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(memory<long long int> gv, const memory<long long int> v,
+                           const int k, const Op op, const Transpose trans);
+
+template
+void ogsOperator_t::Gather(pinnedMemory<float> gv, const memory<float> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<double> gv, const memory<double> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<int> gv, const memory<int> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<long long int> gv, const memory<long long int> v,
+                           const int k, const Op op, const Transpose trans);
+
+template
+void ogsOperator_t::Gather(pinnedMemory<float> gv, const pinnedMemory<float> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<double> gv, const pinnedMemory<double> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<int> gv, const pinnedMemory<int> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<long long int> gv, const pinnedMemory<long long int> v,
+                           const int k, const Op op, const Transpose trans);
+
+
+template<typename T>
+void ogsOperator_t::Gather(deviceMemory<T> o_gv,
+                           deviceMemory<T> o_v,
+                           const int k,
+                           const Op op,
+                           const Transpose trans) { // device gather: launches gatherKernel[type][op] on the N or T operator data
+  constexpr Type type = ogsType<T>::get();
+  InitializeKernels(platform, type, op); // presumably builds the kernel for (type, op) if not yet available - confirm
+
+  if (trans==NoTrans) {
+    if (NrowBlocksN) // no launch when there are no row blocks
+      gatherKernel[type][op](NrowBlocksN,
+                              k,
+                              o_blockRowStartsN,
+                              o_rowStartsN,
+                              o_colIdsN,
+                              o_v,
+                              o_gv);
+  } else {
+    if (NrowBlocksT) // no launch when there are no row blocks
+      gatherKernel[type][op](NrowBlocksT,
+                              k,
+                              o_blockRowStartsT,
+                              o_rowStartsT,
+                              o_colIdsT,
+                              o_v,
+                              o_gv);
+  }
+}
+
+template // explicit instantiations of the device Gather for float, double, int, long long int
+void ogsOperator_t::Gather(deviceMemory<float> gv, const deviceMemory<float> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(deviceMemory<double> gv, const deviceMemory<double> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(deviceMemory<int> gv, const deviceMemory<int> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(deviceMemory<long long int> gv, const deviceMemory<long long int> v,
+                           const int k, const Op op, const Transpose trans);
+
+
+/********************************
+ * Scatter Operation
+ ********************************/
+template <template<typename> class U,
+          template<typename> class V,
+          typename T>
+void ogsOperator_t::Scatter(U<T> v, const V<T> gv,
+                            const int K, const Transpose trans) {
+  // Host scatter: for every row n, copy the K values gv[n] to each column listed in that row of v.
+  dlong Nrows;
+  dlong *__restrict__ rowStarts, *__restrict__ colIds;
+  if (trans==Trans) { // Trans selects the N arrays, anything else the T arrays
+    Nrows = NrowsN;
+    rowStarts = rowStartsN.ptr();
+    colIds = colIdsN.ptr();
+  } else {
+    Nrows = NrowsT;
+    rowStarts = rowStartsT.ptr();
+    colIds = colIdsT.ptr();
+  }
+
+  T*__restrict__ v_ptr  = v.ptr();
+  const T*__restrict__ gv_ptr = gv.ptr();
+
+  if (K==1) { // scalar fast path, no inner component loop
+    #pragma omp parallel for
+    for(dlong n=0;n<Nrows;++n){
+      const dlong start = rowStarts[n];
+      const dlong end   = rowStarts[n+1];
+
+      for(dlong g=start;g<end;++g){
+        v_ptr[colIds[g]] = gv_ptr[n];
+      }
+    }
+  } else {
+    #pragma omp parallel for
+    for(dlong n=0;n<Nrows;++n){
+      const dlong start = rowStarts[n];
+      const dlong end   = rowStarts[n+1];
+
+      for(dlong g=start;g<end;++g){
+        for (int k=0;k<K;++k) { // the K components of an entry are stored contiguously
+          v_ptr[k+colIds[g]*K] = gv_ptr[k+n*K];
+        }
+      }
+    }
+  }
+}
+
+template
+void ogsOperator_t::Scatter(memory<float> v, const memory<float> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<double> v, const memory<double> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<int> v, const memory<int> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<long long int> v, const memory<long long int> gv,
+                            const int K, const Transpose trans);
+
+template
+void ogsOperator_t::Scatter(memory<float> v, const pinnedMemory<float> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<double> v, const pinnedMemory<double> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<int> v, const pinnedMemory<int> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<long long int> v, const pinnedMemory<long long int> gv,
+                            const int K, const Transpose trans);
+
+template<typename T>
+void ogsOperator_t::Scatter(deviceMemory<T> o_v,
+                            deviceMemory<T> o_gv,
+                            const int k,
+                            const Transpose trans) {
+  constexpr Type type = ogsType<T>::get();
+  InitializeKernels(platform, type, Add); // scatterKernel is indexed by type only; Add is just the op handed to the initializer
+  // Device-side scatter: launches scatterKernel[type] with the N arrays for Trans, the T arrays otherwise.
+  if (trans==Trans) {
+    if (NrowBlocksN) // no launch when there are no N row blocks
+      scatterKernel[type](NrowBlocksN,
+                          k,
+                          o_blockRowStartsN,
+                          o_rowStartsN,
+                          o_colIdsN,
+                          o_gv,
+                          o_v);
+  } else {
+    if (NrowBlocksT) // no launch when there are no T row blocks
+      scatterKernel[type](NrowBlocksT,
+                          k,
+                          o_blockRowStartsT,
+                          o_rowStartsT,
+                          o_colIdsT,
+                          o_gv,
+                          o_v);
+  }
+}
+
+template
+void ogsOperator_t::Scatter(deviceMemory<float> v, const deviceMemory<float> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(deviceMemory<double> v, const deviceMemory<double> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(deviceMemory<int> v, const deviceMemory<int> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(deviceMemory<long long int> v, const deviceMemory<long long int> gv,
+                            const int K, const Transpose trans);
+
+/********************************
+ * GatherScatter Operation
+ ********************************/
+template <template<typename> class U,
+          template<typename> class Op,
+          typename T>
+void ogsOperator_t::GatherScatter(U<T> v, const int K,
+                                  const Transpose trans) {
+  // Host in-place gather-scatter: reduce each row's gather columns of v with Op, then write the result to its scatter columns.
+  dlong Nrows;
+  dlong *__restrict__ gRowStarts, *__restrict__ gColIds;
+  dlong *__restrict__ sRowStarts, *__restrict__ sColIds;
+  // g* arrays describe the gather side, s* arrays the scatter side
+  if (trans==Trans) { // gather with T, scatter with N
+    Nrows = NrowsN;
+    gRowStarts = rowStartsT.ptr();
+    gColIds    = colIdsT.ptr();
+    sRowStarts = rowStartsN.ptr();
+    sColIds    = colIdsN.ptr();
+  } else if (trans==Sym) { // gather and scatter both with T
+    Nrows = NrowsT;
+    gRowStarts = rowStartsT.ptr();
+    gColIds    = colIdsT.ptr();
+    sRowStarts = rowStartsT.ptr();
+    sColIds    = colIdsT.ptr();
+  } else { // gather with N, scatter with T
+    Nrows = NrowsT;
+    gRowStarts = rowStartsN.ptr();
+    gColIds    = colIdsN.ptr();
+    sRowStarts = rowStartsT.ptr();
+    sColIds    = colIdsT.ptr();
+  }
+
+  T*__restrict__ v_ptr = v.ptr();
+
+  const Op<T> op;
+
+  if (K==1) { // scalar fast path
+    #pragma omp parallel for
+    for(dlong n=0;n<Nrows;++n){
+      const dlong gstart = gRowStarts[n];
+      const dlong gend   = gRowStarts[n+1];
+      const dlong sstart = sRowStarts[n];
+      const dlong send   = sRowStarts[n+1];
+
+      T val = op.init(); // starting value supplied by the functor
+      for(dlong g=gstart;g<gend;++g){
+        op(val, v_ptr[gColIds[g]]); // accumulates into val
+      }
+      for(dlong s=sstart;s<send;++s){
+        v_ptr[sColIds[s]] = val;
+      }
+    }
+  } else {
+    #pragma omp parallel for
+    for(dlong n=0;n<Nrows;++n){
+      const dlong gstart = gRowStarts[n];
+      const dlong gend   = gRowStarts[n+1];
+      const dlong sstart = sRowStarts[n];
+      const dlong send   = sRowStarts[n+1];
+
+      for (int k=0;k<K;++k) { // each of the K components is reduced independently
+        T val = op.init();
+        for(dlong g=gstart;g<gend;++g){
+          op(val, v_ptr[k+gColIds[g]*K]);
+        }
+        for(dlong s=sstart;s<send;++s){
+          v_ptr[k+sColIds[s]*K] = val;
+        }
+      }
+    }
+  }
+}
+
+template <template<typename> class U,
+          typename T>
+void ogsOperator_t::GatherScatter(U<T> v,
+                                  const int k,
+                                  const Op op,
+                                  const Transpose trans) {
+  // Map the runtime op onto the matching compile-time functor; any other value does nothing.
+  if (op==Add) {
+    GatherScatter<U, Op_Add, T>(v, k, trans);
+  } else if (op==Mul) {
+    GatherScatter<U, Op_Mul, T>(v, k, trans);
+  } else if (op==Max) {
+    GatherScatter<U, Op_Max, T>(v, k, trans);
+  } else if (op==Min) {
+    GatherScatter<U, Op_Min, T>(v, k, trans);
+  }
+}
+
+template
+void ogsOperator_t::GatherScatter(memory<float> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(memory<double> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(memory<int> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(memory<long long int> v,const int k,
+                                  const Op op, const Transpose trans);
+
+template<typename T>
+void ogsOperator_t::GatherScatter(deviceMemory<T> o_v,
+                                  const int k,
+                                  const Op op,
+                                  const Transpose trans) {
+  constexpr Type type = ogsType<T>::get();
+  InitializeKernels(platform, type, op); // initialize for the requested op, as the device Gather does
+  // Device in-place gather-scatter on o_v; the kernel is selected by `op` instead of being hard-wired to Add.
+  if (trans==Trans) { // same array pairing as the host version: T arrays first, then N arrays
+    if (NrowBlocksT)
+      gatherScatterKernel[type][op](NrowBlocksT,
+                                    k,
+                                    o_blockRowStartsT,
+                                    o_rowStartsT,
+                                    o_colIdsT,
+                                    o_rowStartsN,
+                                    o_colIdsN,
+                                    o_v);
+  } else if (trans==Sym) { // T arrays on both sides
+    if (NrowBlocksT)
+      gatherScatterKernel[type][op](NrowBlocksT,
+                                    k,
+                                    o_blockRowStartsT,
+                                    o_rowStartsT,
+                                    o_colIdsT,
+                                    o_rowStartsT,
+                                    o_colIdsT,
+                                    o_v);
+  } else { // N arrays first, then T arrays; blocks still come from the T partition
+    if (NrowBlocksT)
+      gatherScatterKernel[type][op](NrowBlocksT,
+                                    k,
+                                    o_blockRowStartsT,
+                                    o_rowStartsN,
+                                    o_colIdsN,
+                                    o_rowStartsT,
+                                    o_colIdsT,
+                                    o_v);
+  }
+}
+
+template
+void ogsOperator_t::GatherScatter(deviceMemory<float> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(deviceMemory<double> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(deviceMemory<int> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(deviceMemory<long long int> v,const int k,
+                                  const Op op, const Transpose trans);
+
+void ogsOperator_t::setupRowBlocks() {
+  // Partition the rows into consecutive blocks holding at most gatherNodesPerBlock entries each (N and T sides separately).
+  dlong blockSumN=0, blockSumT=0;
+  NrowBlocksN=0, NrowBlocksT=0;
+
+  if (NrowsN) NrowBlocksN++; // a non-empty operator has at least one block
+  if (NrowsT) NrowBlocksT++;
+
+  for (dlong i=0;i<NrowsT;i++) { // pass 1: count the blocks (reads rowStartsN up to index NrowsT as well)
+    const dlong rowSizeN  = rowStartsN[i+1]-rowStartsN[i];
+    const dlong rowSizeT  = rowStartsT[i+1]-rowStartsT[i];
+
+    //this row is pathologically big. We can't currently run this
+    LIBP_ABORT("Multiplicity of global node id: " << i
+               << " in ogsOperator_t::setupRowBlocks is too large.",
+               rowSizeN > gatherNodesPerBlock); // presumably aborts when this condition holds — confirm against the macro
+    LIBP_ABORT("Multiplicity of global node id: " << i
+               << " in ogsOperator_t::setupRowBlocks is too large.",
+               rowSizeT > gatherNodesPerBlock);
+
+    if (blockSumN+rowSizeN > gatherNodesPerBlock) { //adding this row will exceed the nnz per block
+      NrowBlocksN++; //count the previous block
+      blockSumN=rowSizeN; //start a new row block
+    } else {
+      blockSumN+=rowSizeN; //add this row to the block
+    }
+
+    if (blockSumT+rowSizeT > gatherNodesPerBlock) { //adding this row will exceed the nnz per block
+      NrowBlocksT++; //count the previous block
+      blockSumT=rowSizeT; //start a new row block
+    } else {
+      blockSumT+=rowSizeT; //add this row to the block
+    }
+  }
+
+  blockRowStartsN.calloc(NrowBlocksN+1); // one start per block plus a final end marker
+  blockRowStartsT.calloc(NrowBlocksT+1);
+
+  blockSumN=0, blockSumT=0;
+  NrowBlocksN=0, NrowBlocksT=0;
+  if (NrowsN) NrowBlocksN++; // entry 0 is written by neither pass and keeps its calloc'd value
+  if (NrowsT) NrowBlocksT++;
+
+  for (dlong i=0;i<NrowsT;i++) { // pass 2: same walk, now recording where each new block starts
+    const dlong rowSizeN  = rowStartsN[i+1]-rowStartsN[i];
+    const dlong rowSizeT  = rowStartsT[i+1]-rowStartsT[i];
+
+    if (blockSumN+rowSizeN > gatherNodesPerBlock) { //adding this row will exceed the nnz per block
+      blockRowStartsN[NrowBlocksN++] = i; //mark the previous block
+      blockSumN=rowSizeN; //start a new row block
+    } else {
+      blockSumN+=rowSizeN; //add this row to the block
+    }
+    if (blockSumT+rowSizeT > gatherNodesPerBlock) { //adding this row will exceed the nnz per block
+      blockRowStartsT[NrowBlocksT++] = i; //mark the previous block
+      blockSumT=rowSizeT; //start a new row block
+    } else {
+      blockSumT+=rowSizeT; //add this row to the block
+    }
+  }
+  blockRowStartsN[NrowBlocksN] = NrowsN; // close the last block
+  blockRowStartsT[NrowBlocksT] = NrowsT;
+
+  o_blockRowStartsN = platform.malloc(blockRowStartsN); // device copies used by the kernels
+  o_blockRowStartsT = platform.malloc(blockRowStartsT);
+}
+
+void ogsOperator_t::Free() {
+  // Release host and device storage of the N-side arrays
+  rowStartsN.free();
+  colIdsN.free();
+  blockRowStartsN.free();
+  o_rowStartsN.free();
+  o_colIdsN.free();
+  o_blockRowStartsN.free();
+
+  // ... and of the T-side arrays
+  rowStartsT.free();
+  colIdsT.free();
+  blockRowStartsT.free();
+  o_rowStartsT.free();
+  o_colIdsT.free();
+  o_blockRowStartsT.free();
+
+  // Reset every size so the operator reads as empty
+  Ncols = 0;
+  nnzN = nnzT = 0;
+  NrowsN = NrowsT = 0;
+  NrowBlocksN = 0;
+  NrowBlocksT = 0;
+}
+
+
+template <template<typename> class U,
+          template<typename> class V,
+          typename T>
+void extract(const dlong N,
+             const int K,
+             const memory<dlong> ids,
+             const U<T> q,
+             V<T> gatherq) {
+  // Pack the K-wide entries of q selected by ids[0..N) contiguously into gatherq.
+  const T*__restrict__ src = q.ptr();
+  T*__restrict__ dst = gatherq.ptr();
+
+  if (K!=1) {
+    for(dlong i=0;i<N;++i){
+      const dlong col = ids[i];
+      for (int c=0;c<K;++c) {
+        dst[c+i*K] = src[c+col*K];
+      }
+    }
+  } else {
+    // single-component case: plain indexed copy
+    for(dlong i=0;i<N;++i){
+      dst[i] = src[ids[i]];
+    }
+  }
+}
+
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const memory<float> q, memory<float> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const memory<double> q, memory<double> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const memory<int> q, memory<int> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const memory<long long int> q, memory<long long int> gatherq);
+
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const pinnedMemory<float> q, pinnedMemory<float> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const pinnedMemory<double> q, pinnedMemory<double> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const pinnedMemory<int> q, pinnedMemory<int> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const pinnedMemory<long long int> q, pinnedMemory<long long int> gatherq);
+
+} //namespace ogs
+
+} //namespace libp
diff --git a/libs/ogs/ogsPairwise.cpp b/libs/ogs/ogsPairwise.cpp
new file mode 100644
index 000000000..91098328d
--- /dev/null
+++ b/libs/ogs/ogsPairwise.cpp
@@ -0,0 +1,430 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "ogs.hpp"
+#include "ogs/ogsUtils.hpp"
+#include "ogs/ogsExchange.hpp"
+
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::sort;
+#else
+using std::sort;
+#endif
+
+namespace libp {
+
+namespace ogs {
+
+/**********************************
+* Host exchange
+***********************************/
+template<typename T>
+inline void ogsPairwise_t::Start(pinnedMemory<T> &buf, const int k,
+                          const Op op, const Transpose trans){
+  // Host exchange, first half: post all receives, pack the send buffer, post all sends. `op` is not used here.
+  pinnedMemory<T> sendBuf = h_sendspace;
+
+  const int NranksSend  = (trans==NoTrans) ? NranksSendN  : NranksSendT;
+  const int NranksRecv  = (trans==NoTrans) ? NranksRecvN  : NranksRecvT;
+  const int *sendRanks  = (trans==NoTrans) ? sendRanksN.ptr()   : sendRanksT.ptr();
+  const int *recvRanks  = (trans==NoTrans) ? recvRanksN.ptr()   : recvRanksT.ptr();
+  const int *sendCounts = (trans==NoTrans) ? sendCountsN.ptr()  : sendCountsT.ptr();
+  const int *recvCounts = (trans==NoTrans) ? recvCountsN.ptr()  : recvCountsT.ptr();
+  const int *sendOffsets= (trans==NoTrans) ? sendOffsetsN.ptr() : sendOffsetsT.ptr();
+  const int *recvOffsets= (trans==NoTrans) ? recvOffsetsN.ptr() : recvOffsetsT.ptr();
+
+  //post recvs
+  for (int r=0;r<NranksRecv;r++) {
+    comm.Irecv(buf + Nhalo*k + recvOffsets[r]*k, // received data lands after the first Nhalo*k entries of buf
+               recvRanks[r],
+               k*recvCounts[r],
+               recvRanks[r], // presumably the message tag (sender's rank); the matching Isend passes `rank` here
+               requests[r]);
+  }
+
+  // extract the send buffer
+  if (trans == NoTrans)
+    extract(NsendN, k, sendIdsN, buf, sendBuf);
+  else
+    extract(NsendT, k, sendIdsT, buf, sendBuf);
+
+  //post sends
+  for (int r=0;r<NranksSend;r++) {
+    comm.Isend(sendBuf + sendOffsets[r]*k,
+              sendRanks[r],
+              k*sendCounts[r],
+              rank,
+              requests[NranksRecv+r]); // send requests are stored after the receive requests
+  }
+}
+
+template<typename T>
+inline void ogsPairwise_t::Finish(pinnedMemory<T> &buf, const int k,
+                           const Op op, const Transpose trans){
+  // Host exchange, second half: wait for the requests posted by Start, then reduce the received data in place.
+  const int NranksSend  = (trans==NoTrans) ? NranksSendN  : NranksSendT;
+  const int NranksRecv  = (trans==NoTrans) ? NranksRecvN  : NranksRecvT;
+  const int *recvOffsets= (trans==NoTrans) ? recvOffsetsN.ptr() : recvOffsetsT.ptr();
+
+  comm.Waitall(NranksRecv+NranksSend, requests);
+
+  //if we received anything via MPI, gather the recv buffer and scatter
+  // it back to the original vector
+  dlong Nrecv = recvOffsets[NranksRecv]; // last offset entry = total number of received nodes
+  if (Nrecv) {
+    // gather the received nodes
+    postmpi.Gather(buf, buf, k, op, trans);
+  }
+}
+// Concrete host overloads, one per supported element type; each forwards to the Start/Finish template above.
+void ogsPairwise_t::Start(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); }
+void ogsPairwise_t::Start(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsPairwise_t::Start(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsPairwise_t::Start(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
+/**********************************
+* GPU-aware exchange
+***********************************/
+template<typename T>
+void ogsPairwise_t::Start(deviceMemory<T> &o_buf,
+                          const int k,
+                          const Op op,
+                          const Transpose trans){
+  // Device exchange, first half: only packs the send buffer on the device; all MPI calls are made in Finish.
+  const dlong Nsend = (trans == NoTrans) ? NsendN : NsendT;
+
+  if (Nsend) { // nothing to pack when this rank sends no nodes
+    deviceMemory<T> o_sendBuf = o_sendspace;
+
+    //  assemble the send buffer on device
+    if (trans == NoTrans) {
+      extractKernel[ogsType<T>::get()](NsendN, k, o_sendIdsN, o_buf, o_sendBuf);
+    } else {
+      extractKernel[ogsType<T>::get()](NsendT, k, o_sendIdsT, o_buf, o_sendBuf);
+    }
+    //wait for kernel to finish on default stream
+    device_t &device = platform.device;
+    device.finish();
+  }
+}
+
+template<typename T>
+void ogsPairwise_t::Finish(deviceMemory<T> &o_buf,
+                           const int k,
+                           const Op op,
+                           const Transpose trans){
+  // Device exchange, second half: post receives and sends on the device buffers, wait, then gather on the device.
+  deviceMemory<T> o_sendBuf = o_sendspace;
+
+  const int NranksSend  = (trans==NoTrans) ? NranksSendN  : NranksSendT;
+  const int NranksRecv  = (trans==NoTrans) ? NranksRecvN  : NranksRecvT;
+  const int *sendRanks  = (trans==NoTrans) ? sendRanksN.ptr()   : sendRanksT.ptr();
+  const int *recvRanks  = (trans==NoTrans) ? recvRanksN.ptr()   : recvRanksT.ptr();
+  const int *sendCounts = (trans==NoTrans) ? sendCountsN.ptr()  : sendCountsT.ptr();
+  const int *recvCounts = (trans==NoTrans) ? recvCountsN.ptr()  : recvCountsT.ptr();
+  const int *sendOffsets= (trans==NoTrans) ? sendOffsetsN.ptr() : sendOffsetsT.ptr();
+  const int *recvOffsets= (trans==NoTrans) ? recvOffsetsN.ptr() : recvOffsetsT.ptr();
+
+  //post recvs
+  for (int r=0;r<NranksRecv;r++) {
+    comm.Irecv(o_buf + Nhalo*k + recvOffsets[r]*k, // received data lands after the first Nhalo*k entries of o_buf
+              recvRanks[r],
+              k*recvCounts[r],
+              recvRanks[r],
+              requests[r]);
+  }
+
+  //post sends
+  for (int r=0;r<NranksSend;r++) {
+    comm.Isend(o_sendBuf + sendOffsets[r]*k, // o_sendBuf was packed by the device Start
+              sendRanks[r],
+              k*sendCounts[r],
+              rank,
+              requests[NranksRecv+r]); // send requests are stored after the receive requests
+  }
+
+  comm.Waitall(NranksRecv+NranksSend, requests);
+
+  //if we received anything via MPI, gather the recv buffer and scatter
+  // it back to the original vector
+  dlong Nrecv = recvOffsets[NranksRecv]; // last offset entry = total number of received nodes
+  if (Nrecv) {
+    // gather the received nodes on device
+    postmpi.Gather(o_buf, o_buf, k, op, trans);
+  }
+}
+// Concrete device overloads, one per supported element type; each forwards to the Start/Finish template above.
+void ogsPairwise_t::Start(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); }
+void ogsPairwise_t::Start(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsPairwise_t::Start(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsPairwise_t::Start(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
+ogsPairwise_t::ogsPairwise_t(dlong Nshared,
+                             memory<parallelNode_t> &sharedNodes,
+                             ogsOperator_t& gatherHalo,
+                             stream_t _dataStream,
+                             comm_t _comm,
+                             platform_t &_platform):
+  ogsExchange_t(_platform,_comm,_dataStream) {
+  // Builds the send lists, the post-exchange gather operator (postmpi) and the compressed per-neighbour rank lists.
+  Nhalo  = gatherHalo.NrowsT;
+  NhaloP = gatherHalo.NrowsN;
+
+  // sort the list by destination rank, i.e. the order in which the Alltoallv below sends it
+  sort(sharedNodes.ptr(), sharedNodes.ptr()+Nshared,
+       [](const parallelNode_t& a, const parallelNode_t& b) {
+         if(a.rank < b.rank) return true; //group by rank
+         if(a.rank > b.rank) return false;
+
+         return a.newId < b.newId; //then order by the localId relative to this rank
+       });
+
+  //make per-rank alltoallv counts and offsets
+  memory<int> mpiSendCountsT(size,0);
+  memory<int> mpiSendCountsN(size,0);
+  memory<int> mpiRecvCountsT(size);
+  memory<int> mpiRecvCountsN(size);
+  memory<int> mpiSendOffsetsT(size+1);
+  memory<int> mpiSendOffsetsN(size+1);
+  memory<int> mpiRecvOffsetsT(size+1);
+  memory<int> mpiRecvOffsetsN(size+1);
+
+  for (dlong n=0;n<Nshared;n++) { //loop through nodes we need to send
+    const int r = sharedNodes[n].rank;
+    if (sharedNodes[n].sign>0) mpiSendCountsN[r]++; // NOTE(review): counted with sign>0 but sendIdsN is filled with sign==2 below — confirm these agree
+    mpiSendCountsT[r]++;
+  }
+
+  //shared counts
+  comm.Alltoall(mpiSendCountsT, mpiRecvCountsT);
+  comm.Alltoall(mpiSendCountsN, mpiRecvCountsN);
+
+  //cumulative sum
+  mpiSendOffsetsN[0] = 0;
+  mpiSendOffsetsT[0] = 0;
+  mpiRecvOffsetsN[0] = 0;
+  mpiRecvOffsetsT[0] = 0;
+  for (int r=0;r<size;r++) {
+    mpiSendOffsetsN[r+1] = mpiSendOffsetsN[r]+mpiSendCountsN[r];
+    mpiSendOffsetsT[r+1] = mpiSendOffsetsT[r]+mpiSendCountsT[r];
+    mpiRecvOffsetsN[r+1] = mpiRecvOffsetsN[r]+mpiRecvCountsN[r];
+    mpiRecvOffsetsT[r+1] = mpiRecvOffsetsT[r]+mpiRecvCountsT[r];
+  }
+
+  //make ops for scattering halo nodes before sending
+  NsendN=mpiSendOffsetsN[size];
+  NsendT=mpiSendOffsetsT[size];
+
+  sendIdsN.calloc(NsendN);
+  sendIdsT.calloc(NsendT);
+
+  NsendN=0; //positive node count
+  NsendT=0; //all node count
+
+  for (dlong n=0;n<Nshared;n++) { //loop through nodes we need to send
+    dlong id = sharedNodes[n].newId; //coalesced index for this baseId on this rank
+    if (sharedNodes[n].sign==2) {
+      sendIdsN[NsendN++] = id;
+    }
+    sendIdsT[NsendT++] = id;
+  }
+  o_sendIdsT = platform.malloc(sendIdsT);
+  o_sendIdsN = platform.malloc(sendIdsN);
+
+  //send the node lists so we know what we'll receive
+  dlong Nrecv = mpiRecvOffsetsT[size];
+  memory<parallelNode_t> recvNodes(Nrecv);
+
+  //Send list of nodes to each rank
+  comm.Alltoallv(sharedNodes, mpiSendCountsT, mpiSendOffsetsT,
+                   recvNodes, mpiRecvCountsT, mpiRecvOffsetsT);
+
+  //make ops for gathering halo nodes after the MPI exchange
+  postmpi.platform = platform;
+  postmpi.kind = Signed;
+
+  postmpi.NrowsN = Nhalo;
+  postmpi.NrowsT = Nhalo;
+  postmpi.rowStartsN.calloc(Nhalo+1);
+  postmpi.rowStartsT.calloc(Nhalo+1);
+
+  //make array of counters
+  memory<dlong> haloGatherTCounts(Nhalo);
+  memory<dlong> haloGatherNCounts(Nhalo);
+
+  //count the data that will already be in h_haloBuf.ptr()
+  for (dlong n=0;n<Nhalo;n++) {
+    haloGatherNCounts[n] = (n<NhaloP) ? 1 : 0; // only the first NhaloP rows have a local N entry
+    haloGatherTCounts[n] = 1;
+  }
+
+  for (dlong n=0;n<Nrecv;n++) { //loop through nodes needed for gathering halo nodes
+    dlong id = recvNodes[n].localId; //coalesced index for this baseId on this rank
+    if (recvNodes[n].sign==2) haloGatherNCounts[id]++;  //tally
+    haloGatherTCounts[id]++;  //tally
+  }
+
+  for (dlong i=0;i<Nhalo;i++) { // prefix-sum the tallies into row starts, then reuse the counters as fill cursors
+    postmpi.rowStartsN[i+1] = postmpi.rowStartsN[i] + haloGatherNCounts[i];
+    postmpi.rowStartsT[i+1] = postmpi.rowStartsT[i] + haloGatherTCounts[i];
+    haloGatherNCounts[i] = 0;
+    haloGatherTCounts[i] = 0;
+  }
+  postmpi.nnzN = postmpi.rowStartsN[Nhalo];
+  postmpi.nnzT = postmpi.rowStartsT[Nhalo];
+  postmpi.colIdsN.calloc(postmpi.nnzN);
+  postmpi.colIdsT.calloc(postmpi.nnzT);
+
+  for (dlong n=0;n<NhaloP;n++) { // local entries first: row n gathers from column n
+    const dlong soffset = postmpi.rowStartsN[n];
+    const int sindex  = haloGatherNCounts[n];
+    postmpi.colIdsN[soffset+sindex] = n; //record id
+    haloGatherNCounts[n]++;
+  }
+  for (dlong n=0;n<Nhalo;n++) {
+    const dlong soffset = postmpi.rowStartsT[n];
+    const int sindex  = haloGatherTCounts[n];
+    postmpi.colIdsT[soffset+sindex] = n; //record id
+    haloGatherTCounts[n]++;
+  }
+
+  dlong cnt=Nhalo; //next column index for a received positive node (they follow the Nhalo local entries)
+  for (dlong n=0;n<Nrecv;n++) { //loop through the nodes we will receive
+    dlong id = recvNodes[n].localId; //coalesced index for this baseId on this rank
+    if (recvNodes[n].sign==2) {
+      const dlong soffset = postmpi.rowStartsN[id];
+      const int sindex  = haloGatherNCounts[id];
+      postmpi.colIdsN[soffset+sindex] = cnt++; //record id
+      haloGatherNCounts[id]++;
+    }
+    const dlong soffset = postmpi.rowStartsT[id];
+    const int sindex  = haloGatherTCounts[id];
+    postmpi.colIdsT[soffset+sindex] = n + Nhalo; //record id
+    haloGatherTCounts[id]++;
+  }
+
+  postmpi.o_rowStartsN = platform.malloc(postmpi.rowStartsN);
+  postmpi.o_rowStartsT = platform.malloc(postmpi.rowStartsT);
+  postmpi.o_colIdsN = platform.malloc(postmpi.colIdsN);
+  postmpi.o_colIdsT = platform.malloc(postmpi.colIdsT);
+
+  //free up space
+  recvNodes.free();
+  haloGatherNCounts.free();
+  haloGatherTCounts.free();
+
+  postmpi.setupRowBlocks();
+
+  //compress the send/recv counts to pairwise exchanges
+  NranksSendN=0;
+  NranksSendT=0;
+  NranksRecvN=0;
+  NranksRecvT=0;
+  for (int r=0;r<size;r++) { // count the ranks we actually exchange data with
+    NranksSendN += (mpiSendCountsN[r]>0) ? 1 : 0;
+    NranksSendT += (mpiSendCountsT[r]>0) ? 1 : 0;
+    NranksRecvN += (mpiRecvCountsN[r]>0) ? 1 : 0;
+    NranksRecvT += (mpiRecvCountsT[r]>0) ? 1 : 0;
+  }
+
+  sendRanksN.calloc(NranksSendN);
+  sendRanksT.calloc(NranksSendT);
+  recvRanksN.calloc(NranksRecvN);
+  recvRanksT.calloc(NranksRecvT);
+  sendCountsN.calloc(NranksSendN);
+  sendCountsT.calloc(NranksSendT);
+  recvCountsN.calloc(NranksRecvN);
+  recvCountsT.calloc(NranksRecvT);
+  sendOffsetsN.calloc(NranksSendN+1);
+  sendOffsetsT.calloc(NranksSendT+1);
+  recvOffsetsN.calloc(NranksRecvN+1);
+  recvOffsetsT.calloc(NranksRecvT+1);
+
+  //reset
+  NranksSendN=0;
+  NranksSendT=0;
+  NranksRecvN=0;
+  NranksRecvT=0;
+  for (int r=0;r<size;r++) { // second pass: record rank, count and offset of every non-empty exchange
+    if (mpiSendCountsN[r]>0) {
+      sendRanksN[NranksSendN]  = r;
+      sendCountsN[NranksSendN] = mpiSendCountsN[r];
+      sendOffsetsN[NranksSendN] = mpiSendOffsetsN[r];
+      NranksSendN++;
+    }
+    if (mpiSendCountsT[r]>0) {
+      sendRanksT[NranksSendT]  = r;
+      sendCountsT[NranksSendT] = mpiSendCountsT[r];
+      sendOffsetsT[NranksSendT] = mpiSendOffsetsT[r];
+      NranksSendT++;
+    }
+    if (mpiRecvCountsN[r]>0) {
+      recvRanksN[NranksRecvN]   = r;
+      recvCountsN[NranksRecvN]  = mpiRecvCountsN[r];
+      recvOffsetsN[NranksRecvN] = mpiRecvOffsetsN[r];
+      NranksRecvN++;
+    }
+    if (mpiRecvCountsT[r]>0) {
+      recvRanksT[NranksRecvT]   = r;
+      recvCountsT[NranksRecvT]  = mpiRecvCountsT[r];
+      recvOffsetsT[NranksRecvT] = mpiRecvOffsetsT[r];
+      NranksRecvT++;
+    }
+  }
+  sendOffsetsN[NranksSendN] = mpiSendOffsetsN[size]; // final entries hold the totals (Finish reads recvOffsets[NranksRecv])
+  sendOffsetsT[NranksSendT] = mpiSendOffsetsT[size];
+  recvOffsetsN[NranksRecvN] = mpiRecvOffsetsN[size];
+  recvOffsetsT[NranksRecvT] = mpiRecvOffsetsT[size];
+
+  requests.malloc(NranksSendT+NranksRecvT); // sized from the T counts; Start/Finish also index it with the N counts
+
+  //make scratch space
+  AllocBuffer(sizeof(dfloat));
+}
+
+void ogsPairwise_t::AllocBuffer(size_t Nbytes) { // Nbytes is used as a per-entry size multiplier
+  if (o_workspace.size() < postmpi.nnzT*Nbytes) { // reallocate only when the device workspace is too small
+    h_workspace = platform.hostMalloc<char>(postmpi.nnzT*Nbytes);
+    o_workspace = platform.malloc<char>(postmpi.nnzT*Nbytes);
+  }
+  if (o_sendspace.size() < NsendT*Nbytes) { // same policy for the send buffers, sized by NsendT
+    h_sendspace = platform.hostMalloc<char>(NsendT*Nbytes);
+    o_sendspace = platform.malloc<char>(NsendT*Nbytes);
+  }
+}
+
+} //namespace ogs
+
+} //namespace libp
diff --git a/libs/ogs/ogsSetup.cpp b/libs/ogs/ogsSetup.cpp
index 8993cd894..0eee174dd 100644
--- a/libs/ogs/ogsSetup.cpp
+++ b/libs/ogs/ogsSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,520 +25,896 @@ SOFTWARE.
 */
 
 #include "ogs.hpp"
-#include "ogs/ogsKernels.hpp"
-
-typedef struct{
+#include "ogs/ogsUtils.hpp"
+#include "ogs/ogsOperator.hpp"
+#include "ogs/ogsExchange.hpp"
+#include "timer.hpp"
+
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::sort;
+#else
+using std::sort;
+#endif
+
+namespace libp {
+
+namespace ogs {
+
+void ogs_t::Setup(const dlong _N,
+                  memory<hlong> ids,
+                  comm_t _comm,
+                  const Kind _kind,
+                  const Method method,
+                  const bool _unique,
+                  const bool verbose,
+                  platform_t& _platform){
+  ogsBase_t::Setup(_N, ids, _comm, _kind, method, _unique, verbose, _platform); //forwards every argument unchanged to the base-class setup
+}
 
-  dlong localId;    // local node id
-  hlong baseId;     // original global index
+void halo_t::Setup(const dlong _N,
+                  memory<hlong> ids,
+                  comm_t _comm,
+                  const Method method,
+                  const bool verbose,
+                  platform_t& _platform){
+  ogsBase_t::Setup(_N, ids, _comm, Halo, method, false, verbose, _platform); //kind is fixed to Halo and unique to false
 
-  dlong newId;         // new global id
-  int sign;
+  Nhalo = NhaloT - NhaloP; //number of extra received nodes
+}
 
-}parallelNode_t;
+/********************************
+ * Setup
+ ********************************/
+void ogsBase_t::Setup(const dlong _N, //number of entries read from ids
+                      memory<hlong> ids, //global id per entry; zero entries are skipped
+                      comm_t _comm,
+                      const Kind _kind, //Signed and Unsigned get their own local setup; any other value takes the Halo path
+                      const Method method, //AllToAll, Pairwise, or CrystalRouter; any other value goes through AutoSetup
+                      const bool _unique, //if true, ids are overwritten with the signs chosen during setup
+                      const bool verbose, //if true, rank 0 prints setup information
+                      platform_t& _platform){
 
-void setupRowBlocks(ogsData_t &A, platform_t &platform);
+  //release resources if this ogs was setup before
+  Free();
 
-ogs_t *ogs_t::Setup(dlong N, hlong *ids, MPI_Comm &comm,
-                    int verbose, platform_t& platform){
+  timePoint_t start = Time();
 
-  ogs_t *ogs = new ogs_t(platform, comm);
+  platform = _platform;
 
-  //Keep track of how many gs handles we've created, and
-  // build kernels if this is the first
-  if (!ogs::Nrefs) ogs::initKernels(platform);
-  ogs::Nrefs++;
+  if (!dataStream.isInitialized())
+      dataStream = platform.device.createStream();
 
-  ogs->N = N;
+  N = _N;
+  comm = _comm;
+  kind = _kind;
+  unique = _unique;
 
   int rank, size;
-  MPI_Comm_rank(ogs->comm, &rank);
-  MPI_Comm_size(ogs->comm, &size);
-
-  //use the host gs to find what nodes are local to this rank
-  int *minRank = (int *) calloc(N,sizeof(int));
-  int *maxRank = (int *) calloc(N,sizeof(int));
-  hlong *flagIds   = (hlong *) calloc(N,sizeof(hlong));
-  for (dlong i=0;i<N;i++) {
-    minRank[i] = rank;
-    maxRank[i] = rank;
-    flagIds[i] = abs(ids[i]); //ignore negative ids for this
+  rank = comm.rank();
+  size = comm.size();
+
+  //sanity check options: unique cannot be combined with the Unsigned or Halo kinds
+  LIBP_ABORT("Invalid ogs setup requested",
+             (kind==Unsigned && unique==true)
+              || (kind==Halo && unique==true));
+
+  //count how many ids are non-zero
+  dlong Nids=0;
+  for (dlong n=0;n<N;n++)
+    if (ids[n]!=0) Nids++;
+
+  // make list of nodes
+  memory<parallelNode_t> nodes(Nids);
+
+  //fill the data (squeezing out zero ids)
+  Nids=0;
+  for (dlong n=0;n<N;n++) {
+    if (ids[n]!=0) {
+      nodes[Nids].localId = Nids; //record a compressed id first (useful for ordering)
+      nodes[Nids].baseId = (kind==Unsigned) ?
+                            abs(ids[n]) : ids[n]; //record global id
+      nodes[Nids].rank = rank;
+      nodes[Nids].destRank = abs(ids[n]) % size; //rank that FindSharedNodes sends this node to for grouping
+      Nids++;
+    }
   }
 
-  //make a host gs handle (calls gslib)
-  void *gsHandle = ogs::gsSetup(comm, N, flagIds, 0, 0);
-  ogs::gsGatherScatter(minRank, 1, 1, 0, ogs_int, ogs_min, ogs_notrans, gsHandle);
-  ogs::gsGatherScatter(maxRank, 1, 1, 0, ogs_int, ogs_max, ogs_notrans, gsHandle);
-  ogs::gsFree(gsHandle); //discard the large gs handle
-  free(flagIds);
+  //flag which nodes are shared via MPI
+  FindSharedNodes(Nids, nodes, verbose);
 
-  //minRank[n] contains the smallest rank taking part in the gatherScatter of node n
-  //maxRank[n] contains the largest rank taking part in the gatherScatter of node n
+  //Index the local and halo baseIds on this rank and
+  // construct sharedNodes which contains all the info
+  // we need to setup the MPI exchange.
+  dlong Nshared=0;
+  memory<parallelNode_t> sharedNodes;
+  ConstructSharedNodes(Nids, nodes, Nshared, sharedNodes);
 
-  //count local and halo nodes
-  ogs->Nlocal=0; ogs->Nhalo=0;
-  for (dlong i=0;i<N;i++) {
-    if (ids[i]==0) continue;
+  Nids=0;
+  for (dlong n=0;n<N;n++) {
+    if (ids[n]!=0) {
+      nodes[Nids].localId = n; //record the real id now
 
-    if ((minRank[i]!=rank)||(maxRank[i]!=rank)) {
-      ogs->Nhalo++;
-    } else {
-      ogs->Nlocal++;
+      //if we altered the signs of ids, write them back. NOTE(review): ids is taken by value; confirm memory<> copies share storage so the caller sees this
+      if (unique)
+        ids[n] = nodes[Nids].baseId;
+
+      Nids++;
     }
   }
 
-  //set up the local gatherScatter
-  parallelNode_t *localNodes = (parallelNode_t*) calloc(ogs->Nlocal,sizeof(parallelNode_t));
+  //setup local gather operators
+  if (kind==Signed)
+    LocalSignedSetup(Nids, nodes);
+  else if (kind==Unsigned)
+    LocalUnsignedSetup(Nids, nodes);
+  else
+    LocalHaloSetup(Nids, nodes);
+
+  //with that, we're done with the local nodes list
+  nodes.free();
+
+  // At this point, we've setup gs operators to gather/scatter the purely local nodes,
+  // and gather/scatter the shared halo nodes to/from a coalesced ordering. We now
+  // need gs operators to scatter/gather the coalesced halo nodes to/from the expected
+  // orderings for MPI communications.
+
+  if (method == AllToAll) {
+    exchange = std::shared_ptr<ogsExchange_t>(
+                  new ogsAllToAll_t(Nshared, sharedNodes,
+                                    *gatherHalo, dataStream,
+                                    comm, platform));
+  } else if (method == Pairwise) {
+    exchange = std::shared_ptr<ogsExchange_t>(
+                  new ogsPairwise_t(Nshared, sharedNodes,
+                                    *gatherHalo, dataStream,
+                                    comm, platform));
+  } else if (method == CrystalRouter) {
+    exchange = std::shared_ptr<ogsExchange_t>(
+                  new ogsCrystalRouter_t(Nshared, sharedNodes,
+                                         *gatherHalo, dataStream,
+                                         comm, platform));
+  } else { //Auto
+    exchange = std::shared_ptr<ogsExchange_t>(
+                  AutoSetup(Nshared, sharedNodes,
+                            *gatherHalo, comm,
+                            platform, verbose));
+  }
 
-  dlong cnt=0;
-  for (dlong i=0;i<N;i++) {
-    if (ids[i]==0) continue;
+  timePoint_t end = GlobalPlatformTime(platform, comm);
+  double elapsedTime = ElapsedTime(start, end);
 
-    if ((minRank[i]==rank)&&(maxRank[i]==rank)) {
-      localNodes[cnt].localId = i;
-      localNodes[cnt].baseId  = ids[i];
-      cnt++;
-    }
+  if (!rank && verbose) {
+    std::cout << "ogs Setup Time: " << elapsedTime << " seconds." << std::endl;
   }
+}
 
-  // sort based on base ids (putting positive ids first) then local id
-  std::sort(localNodes, localNodes+ogs->Nlocal,
-            [](const parallelNode_t& a, const parallelNode_t& b) {
-              if(abs(a.baseId) < abs(b.baseId)) return true; //group by abs(baseId)
-              if(abs(a.baseId) > abs(b.baseId)) return false;
-
-              if(a.baseId > b.baseId) return true; //positive ids first
-              if(a.baseId < b.baseId) return false;
-
-              return (a.localId < b.localId); //sort by local id
-            });
-
-  //flag each set of ids by whether there is at least one positive id
-  // and count how many local gather/scatter nodes we have
-  ogs->localGather.Nrows = 0;
-  ogs->localScatter.Nrows = 0;
-  if (ogs->Nlocal) {
-    localNodes[0].newId = 0;
-    int sign = (localNodes[0].baseId > 0) ? 1 : -1;
-    localNodes[0].sign = sign;
-    if (sign > 0) ogs->localGather.Nrows++;
-
-    for (dlong i=1;i<ogs->Nlocal;i++) {
-      if (abs(localNodes[i].baseId)!=abs(localNodes[i-1].baseId)) {
-        sign = (localNodes[i].baseId > 0) ? 1 : -1;
-        ogs->localScatter.Nrows++;
-        if (sign > 0) ogs->localGather.Nrows++;
-      }
+void ogsBase_t::FindSharedNodes(const dlong Nids, //number of entries in nodes
+                                memory<parallelNode_t> &nodes, //in/out: comes back with sign set to 1 (baseId on one rank only) or 2 (shared across ranks)
+                                const int verbose){ //if set, rank 0 prints the shared-label count
 
-      localNodes[i].newId = ogs->localScatter.Nrows;
-      localNodes[i].sign = sign;
-    }
-    ogs->localScatter.Nrows++;
+  int rank, size;
+  rank = comm.rank();
+  size = comm.size();
+
+  memory<int> sendCounts(size,0);
+  memory<int> recvCounts(size);
+  memory<int> sendOffsets(size+1);
+  memory<int> recvOffsets(size+1);
+
+  //count number of ids we're sending
+  for (dlong n=0;n<Nids;n++) {
+    sendCounts[nodes[n].destRank]++;
   }
 
-  // sort back to local ids
-  std::sort(localNodes, localNodes+ogs->Nlocal,
-            [](const parallelNode_t& a, const parallelNode_t& b) {
-              return (a.localId < b.localId); //sort by local id
-            });
+  comm.Alltoall(sendCounts, recvCounts);
 
-  //tally up how many nodes are being gathered to each gatherNode and
-  //  map to a local ordering
-  dlong *localGatherCounts  = (dlong*) calloc(ogs->localScatter.Nrows,sizeof(dlong));
-  dlong *localScatterCounts = (dlong*) calloc(ogs->localScatter.Nrows,sizeof(dlong));
+  sendOffsets[0] = 0;
+  recvOffsets[0] = 0;
+  for (int r=0;r<size;r++) {
+    sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
+    recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
 
-  dlong *localMap = (dlong*) calloc(ogs->localScatter.Nrows,sizeof(dlong));
+    //reset counter (reused below as a per-rank fill cursor, which rebuilds the same counts)
+    sendCounts[r] = 0;
+  }
 
-  for (dlong i=0;i<ogs->localScatter.Nrows;i++) localMap[i] = -1; //initialize map
+  //write a send ordering into newIds
+  for (dlong n=0;n<Nids;n++) {
+    const int r = nodes[n].destRank;
+    nodes[n].newId = sendOffsets[r]+sendCounts[r]++;
+  }
 
-  cnt = 0;
-  dlong cnt2 = ogs->localGather.Nrows;
-  for (dlong i=0;i<ogs->Nlocal;i++) {
-    dlong newId = localNodes[i].newId; //get the ordered id
+  // permute the list to send ordering
+  permute(Nids, nodes, [](const parallelNode_t& a) { return a.newId; } );
 
-    //record a new index if this is a new gatherNode (pure negative nodes appended at the end)
-    if (localMap[newId]==-1) {
-      if (localNodes[i].sign > 0)
-        localMap[newId] = cnt++;
-      else
-        localMap[newId] = cnt2++;
-    }
+  dlong recvN = recvOffsets[size]; //total ids to recv
 
-    dlong gid = localMap[newId];
-    localNodes[i].newId = gid; //reorder
-    localScatterCounts[gid]++;  //tally
-    if (localNodes[i].baseId > 0)
-      localGatherCounts[gid]++;  //tally
-  }
-  free(localMap);
+  memory<parallelNode_t> recvNodes(recvN);
 
-  ogs->localGather.rowStarts  = (dlong*) calloc(ogs->localScatter.Nrows+1,sizeof(dlong));
-  ogs->localScatter.rowStarts = (dlong*) calloc(ogs->localScatter.Nrows+1,sizeof(dlong));
-  for (dlong i=0;i<ogs->localScatter.Nrows;i++) {
-    ogs->localGather.rowStarts[i+1]  = ogs->localGather.rowStarts[i]  + localGatherCounts[i];
-    ogs->localScatter.rowStarts[i+1] = ogs->localScatter.rowStarts[i] + localScatterCounts[i];
+  //Send all the nodes to their destination rank.
+  comm.Alltoallv(    nodes, sendCounts, sendOffsets,
+                 recvNodes, recvCounts, recvOffsets);
 
-    //reset counters
-    localScatterCounts[i] = 0;
-    localGatherCounts[i] = 0;
+  //remember this ordering so the list can be permuted back before the return trip
+  for (dlong n=0;n<recvN;n++) {
+    recvNodes[n].newId = n;
   }
 
-  ogs->localGather.nnz  = ogs->localGather.rowStarts[ogs->localGather.Nrows];
-  ogs->localScatter.nnz = ogs->localScatter.rowStarts[ogs->localScatter.Nrows];
+  // sort based on abs(baseId)
+  sort(recvNodes.ptr(), recvNodes.ptr()+recvN,
+       [](const parallelNode_t& a, const parallelNode_t& b) {
+         return abs(a.baseId) < abs(b.baseId);
+       });
+
+  // We now have a collection of nodes associated with some subset of all global Ids
+  // Our list is sorted by baseId to group nodes with the same globalId together
+  // We now want to flag which nodes are shared via MPI
+
+  int is_unique=1;
+
+  dlong Nshared=0;
+
+  dlong start=0;
+  for (dlong n=0;n<recvN;n++) {
+    if (n==recvN-1 || abs(recvNodes[n].baseId)!=abs(recvNodes[n+1].baseId)) {
+      dlong end = n+1;
+
+      int positiveCount=0;
+      if (unique) {
+        //Make a single node from each baseId group the sole positive node
+        const hlong baseId = abs(recvNodes[start].baseId);
 
-  ogs->localGather.colIds  = (dlong*) calloc(ogs->localGather.nnz+1,sizeof(dlong)); //extra entry so the occa buffer will actually exist
-  ogs->localScatter.colIds = (dlong*) calloc(ogs->localScatter.nnz+1,sizeof(dlong)); //extra entry so the occa buffer will actually exist
-  for (dlong i=0;i<ogs->Nlocal;i++) {
-    dlong gid = localNodes[i].newId;
+        //pick a random node in this group
+        const int m = (rand() % (end-start));
 
-    dlong soffset = ogs->localScatter.rowStarts[gid];
-    int sindex  = localScatterCounts[gid];
-    ogs->localScatter.colIds[soffset+sindex] = localNodes[i].localId;
-    localScatterCounts[gid]++;
+        for (int i=start;i<end;i++)
+          recvNodes[i].baseId = -baseId;
 
-    if (localNodes[i].baseId > 0) {
-      dlong goffset = ogs->localGather.rowStarts[gid];
-      int gindex  = localGatherCounts[gid];
-      ogs->localGather.colIds[goffset+gindex] = localNodes[i].localId;
-      localGatherCounts[gid]++;
+        recvNodes[start+m].baseId = baseId;
+        positiveCount=1;
+      } else {
+        //count how many positive baseIds there are in this group
+        for (int i=start;i<end;i++)
+          if (recvNodes[i].baseId>0) positiveCount++;
+
+        //if we didn't find a sole positive baseId, the gather is not well-defined
+        if (positiveCount!=1) is_unique=0;
+      }
+
+      // When making a halo exchange, check that each baseId group has exactly one positive id
+      LIBP_ABORT("Found " << positiveCount << " positive Ids for baseId: "
+                 << abs(recvNodes[start].baseId)<< ".",
+                 kind==Halo && positiveCount!=1);
+
+      //determine if this baseId group spans more than one rank (1 = no, 2 = yes)
+      int shared=1;
+      const int r = recvNodes[start].rank;
+      for (int i=start+1;i<end;i++) {
+        if (recvNodes[i].rank != r) {
+          shared=2;
+          Nshared++;
+          break;
+        }
+      }
+
+      //set shared flag.
+      for (int i=start;i<end;i++) {
+        recvNodes[i].sign = shared;
+      }
+
+      //set new baseId group start point
+      start=n+1;
     }
   }
-  free(localGatherCounts);
-  free(localScatterCounts);
 
-  ogs->localGather.o_rowStarts  = platform.malloc((ogs->localScatter.Nrows+1)*sizeof(dlong), ogs->localGather.rowStarts);
-  ogs->localScatter.o_rowStarts = platform.malloc((ogs->localScatter.Nrows+1)*sizeof(dlong), ogs->localScatter.rowStarts);
+  //share the unique node check so we know if the gather operation is well-defined
+  comm.Allreduce(is_unique, Comm::Min);
+  gather_defined = (is_unique==1);
 
-  ogs->localGather.o_colIds  = platform.malloc((ogs->localGather.nnz+1)*sizeof(dlong), ogs->localGather.colIds);
-  ogs->localScatter.o_colIds = platform.malloc((ogs->localScatter.nnz+1)*sizeof(dlong), ogs->localScatter.colIds);
+  hlong Nshared_global = Nshared;
+  comm.Reduce(Nshared_global, 0); //the reduced value is only printed, and only by rank 0
+  if (!rank && verbose) {
+    std::cout << "ogs Setup: " << Nshared_global << " unique labels shared." << std::endl;
+  }
 
-  //divide the list of colIds into roughly equal sized blocks so that each
-  // threadblock loads approxiamtely an equal amount of data
-  setupRowBlocks(ogs->localGather, platform);
-  setupRowBlocks(ogs->localScatter, platform);
-
-  free(localNodes);
-
-  //make some compressed versions of the gather/scatter ids for the fused gs kernel
-  ogs->fusedGather.Nrows=0;
-  ogs->fusedScatter.Nrows=0;
-  ogs->symGatherScatter.Nrows=0;
-
-  ogs->fusedGather.nnz=0;
-  ogs->fusedScatter.nnz=0;
-  ogs->symGatherScatter.nnz=0;
-
-  for (dlong n=0;n<ogs->localScatter.Nrows;n++) {
-    int gatherCnt  = ogs->localGather.rowStarts[n+1] -ogs->localGather.rowStarts[n];
-    int scatterCnt = ogs->localScatter.rowStarts[n+1]-ogs->localScatter.rowStarts[n];
-
-    //only include this node if either the gather or scatter interact with mulitple nodes
-    // otherwise the op is identity and ignored
-    if ((gatherCnt>1)||(scatterCnt>1)) {
-      ogs->fusedGather.Nrows++;
-      ogs->fusedScatter.Nrows++;
-      ogs->fusedGather.nnz  += gatherCnt;
-      ogs->fusedScatter.nnz += scatterCnt;
-    }
+  //at this point each collection of baseIds either has all nodes have
+  // sign = 1, meaning all the nodes with this baseId are on the
+  // same rank, or have sign=2, meaning that baseId must be communicated
+
+  // permute recv nodes back to recv'd ordering
+  permute(recvN, recvNodes, [](const parallelNode_t& a) { return a.newId; } );
+
+  //Return all the nodes to their origin rank.
+  comm.Alltoallv(recvNodes, recvCounts, recvOffsets,
+                     nodes, sendCounts, sendOffsets);
+}
+
+void ogsBase_t::ConstructSharedNodes(const dlong Nids, //number of entries in nodes
+                                     memory<parallelNode_t> &nodes, //in/out: sign and newId are rewritten; list is returned in localId order
+                                     dlong &Nshared, //out: number of entries in sharedNodes
+                                     memory<parallelNode_t> &sharedNodes) { //out: other ranks' nodes for each shared baseId on this rank
+
+  int size = comm.size();
+
+  // sort based on abs(baseId)
+  sort(nodes.ptr(), nodes.ptr()+Nids,
+       [](const parallelNode_t& a, const parallelNode_t& b) {
+         if(abs(a.baseId) < abs(b.baseId)) return true; //group by abs(baseId)
+         if(abs(a.baseId) > abs(b.baseId)) return false;
+
+         return a.baseId > b.baseId; //positive ids on a rank first
+       });
+
+  //count how many unique global Ids we have on this rank
+  // and flag baseId groups that have a positive baseId somewhere on this rank
+  dlong NbaseIds=0;
+  NlocalT=0; NlocalP=0;
+  NhaloT=0; NhaloP=0;
+  dlong start=0;
+  for (dlong n=0;n<Nids;n++) {
+    if (n==Nids-1 || abs(nodes[n].baseId)!=abs(nodes[n+1].baseId)) {
+      dlong end = n+1;
+
+      //if there's no leading positive id, flag this baseId group as negative
+      int sign = abs(nodes[start].sign);
+      if (nodes[start].baseId<0) {
+        sign = -sign;
+        for (int i=start;i<end;i++) {
+          nodes[i].sign = sign;
+        }
+      }
+
+      //count the positive/negative local and halo gather nodes
+      if (abs(sign)==1) {
+        NlocalT++;
+        if (sign==1) NlocalP++;
+      } else {
+        NhaloT++;
+        if (sign==2) NhaloP++;
+      }
+
+      //record the new ordering
+      for (int i=start;i<end;i++) {
+        nodes[i].newId=NbaseIds;
+      }
 
-    //for the sym op only the scatter ids are used
-    if (scatterCnt>1) {
-      ogs->symGatherScatter.Nrows++;
-      ogs->symGatherScatter.nnz += scatterCnt;
+      NbaseIds++;
+      start = end;
     }
   }
 
-  ogs->fusedGather.rowStarts  = (dlong*) calloc(ogs->fusedScatter.Nrows+1,sizeof(dlong));
-  ogs->fusedScatter.rowStarts = (dlong*) calloc(ogs->fusedScatter.Nrows+1,sizeof(dlong));
-  ogs->symGatherScatter.rowStarts  = (dlong*) calloc(ogs->symGatherScatter.Nrows+1,sizeof(dlong));
-
-  ogs->fusedGather.colIds  = (dlong*) calloc(ogs->fusedGather.nnz+1,sizeof(dlong));
-  ogs->fusedScatter.colIds  = (dlong*) calloc(ogs->fusedScatter.nnz+1,sizeof(dlong));
-  ogs->symGatherScatter.colIds  = (dlong*) calloc(ogs->symGatherScatter.nnz+1,sizeof(dlong));
-
-  //reset counters
-  ogs->fusedGather.Nrows=0;
-  ogs->fusedScatter.Nrows=0;
-  ogs->symGatherScatter.Nrows=0;
-
-  ogs->fusedGather.nnz=0;
-  ogs->fusedScatter.nnz=0;
-  ogs->symGatherScatter.nnz=0;
-  for (dlong n=0;n<ogs->localScatter.Nrows;n++) {
-    int gatherCnt  = ogs->localGather.rowStarts[n+1] -ogs->localGather.rowStarts[n];
-    int scatterCnt = ogs->localScatter.rowStarts[n+1]-ogs->localScatter.rowStarts[n];
-
-    //only include this node if either the gather and scatter interact with mulitple nodes
-    // otherwise the op is identity and ignored
-    if ((gatherCnt>1)||(scatterCnt>1)) {
-      ogs->fusedGather.Nrows++;
-      ogs->fusedScatter.Nrows++;
-      ogs->fusedGather.rowStarts[ogs->fusedGather.Nrows]   = gatherCnt  + ogs->fusedGather.rowStarts[ogs->fusedGather.Nrows-1];
-      ogs->fusedScatter.rowStarts[ogs->fusedScatter.Nrows] = scatterCnt + ogs->fusedScatter.rowStarts[ogs->fusedScatter.Nrows-1];
-
-      for (int i=ogs->localGather.rowStarts[n];i<ogs->localGather.rowStarts[n+1];i++)
-        ogs->fusedGather.colIds[ogs->fusedGather.nnz++] = ogs->localGather.colIds[i];
-
-      for (int i=ogs->localScatter.rowStarts[n];i<ogs->localScatter.rowStarts[n+1];i++)
-        ogs->fusedScatter.colIds[ogs->fusedScatter.nnz++] = ogs->localScatter.colIds[i];
-    }
+  //total number of positive owned gathered nodes
+  Ngather = NlocalP+NhaloP;
 
-    //for the sym op only the scatter ids are used
-    if (scatterCnt>1) {
-      ogs->symGatherScatter.Nrows++;
-      ogs->symGatherScatter.rowStarts[ogs->symGatherScatter.Nrows] = scatterCnt + ogs->symGatherScatter.rowStarts[ogs->symGatherScatter.Nrows-1];
+  //global total
+  NgatherGlobal = Ngather;
+  comm.Allreduce(NgatherGlobal);
 
-      for (int i=ogs->localScatter.rowStarts[n];i<ogs->localScatter.rowStarts[n+1];i++)
-        ogs->symGatherScatter.colIds[ogs->symGatherScatter.nnz++] = ogs->localScatter.colIds[i];
+  //extract the leading node from each shared baseId
+  memory<parallelNode_t> sendSharedNodes(NhaloT);
+
+  NhaloT=0; //reused as the fill cursor for sendSharedNodes in the loop below
+  for (dlong n=0;n<Nids;n++) {
+    if (n==0 || abs(nodes[n].baseId)!=abs(nodes[n-1].baseId)) {
+      if (abs(nodes[n].sign)==2) {
+        sendSharedNodes[NhaloT++] = nodes[n];
+      }
     }
   }
 
-  ogs->fusedGather.o_rowStarts  = platform.malloc((ogs->fusedScatter.Nrows+1)*sizeof(dlong), ogs->fusedGather.rowStarts);
-  ogs->fusedScatter.o_rowStarts = platform.malloc((ogs->fusedScatter.Nrows+1)*sizeof(dlong), ogs->fusedScatter.rowStarts);
-  ogs->symGatherScatter.o_rowStarts = platform.malloc((ogs->symGatherScatter.Nrows+1)*sizeof(dlong), ogs->symGatherScatter.rowStarts);
+  // permute the list back to local id ordering
+  permute(Nids, nodes, [](const parallelNode_t& a) { return a.localId; } );
+
+  // Use the newId index to reorder the baseId groups based on
+  // the order we encounter them in their original ordering.
+  memory<dlong> indexMap(NbaseIds, -1);
+
+  dlong localCntN = 0, localCntT = NlocalP;  //start point for local gather nodes
+  dlong haloCntN  = 0, haloCntT  = NhaloP;   //start point for halo gather nodes
+  for (dlong n=0;n<Nids;n++) {
+    const dlong newId = nodes[n].newId; //get the new baseId group id
+
+    //record a new index if we've not encountered this baseId group before
+    if (indexMap[newId]==-1) {
+      if        (nodes[n].sign== 1) {
+        indexMap[newId] = localCntN++;
+      } else if (nodes[n].sign==-1) {
+        indexMap[newId] = localCntT++;
+      } else if (nodes[n].sign== 2) {
+        indexMap[newId] = haloCntN++;
+      } else { //nodes[n].sign==-2
+        indexMap[newId] = haloCntT++;
+      }
+    }
 
-  ogs->fusedGather.o_colIds  = platform.malloc((ogs->fusedGather.nnz+1)*sizeof(dlong), ogs->fusedGather.colIds);
-  ogs->fusedScatter.o_colIds = platform.malloc((ogs->fusedScatter.nnz+1)*sizeof(dlong), ogs->fusedScatter.colIds);
-  ogs->symGatherScatter.o_colIds = platform.malloc((ogs->symGatherScatter.nnz+1)*sizeof(dlong), ogs->symGatherScatter.colIds);
+    const dlong gid = indexMap[newId];
+    nodes[n].newId = gid; //reorder
+  }
 
-  setupRowBlocks(ogs->fusedGather, platform);
-  setupRowBlocks(ogs->fusedScatter, platform);
-  setupRowBlocks(ogs->symGatherScatter, platform);
+  //re-order the shared node list
+  for (dlong n=0;n<NhaloT;n++) {
+    const dlong newId = sendSharedNodes[n].newId; //get the new baseId group id
+    const dlong gid = indexMap[newId];
+    sendSharedNodes[n].localId = gid; //reorder the localId to the compressed order
+  }
 
-  //use the blocking from the fused scatter for the fusded gather as well
-  if (ogs->fusedGather.blockRowStarts) free(ogs->fusedGather.blockRowStarts);
-  ogs->fusedGather.o_blockRowStarts.free();
-  ogs->fusedGather.NrowBlocks = ogs->fusedScatter.NrowBlocks;
-  ogs->fusedGather.blockRowStarts = ogs->fusedScatter.blockRowStarts;
-  ogs->fusedGather.o_blockRowStarts = ogs->fusedScatter.o_blockRowStarts;
+  indexMap.free();
 
-  //set up the halo gatherScatter
-  parallelNode_t *haloNodes = (parallelNode_t*) calloc(ogs->Nhalo+1,sizeof(parallelNode_t));
+  memory<int> sendCounts(size,0);
+  memory<int> recvCounts(size);
+  memory<int> sendOffsets(size+1);
+  memory<int> recvOffsets(size+1);
 
-  cnt=0;
-  for (dlong i=0;i<N;i++) {
-    if (ids[i]==0) continue;
+  // sort based on destination rank
+  sort(sendSharedNodes.ptr(), sendSharedNodes.ptr()+NhaloT,
+       [](const parallelNode_t& a, const parallelNode_t& b) {
+         return a.destRank < b.destRank;
+       });
 
-    if ((minRank[i]!=rank)||(maxRank[i]!=rank)) {
-      haloNodes[cnt].localId = i;
-      haloNodes[cnt].baseId  = ids[i];
-      cnt++;
-    }
+  //count number of ids we're sending
+  for (dlong n=0;n<NhaloT;n++) {
+    sendCounts[sendSharedNodes[n].destRank]++;
   }
 
-  // sort based on base ids (putting positive ids first) then local id
-  std::sort(haloNodes, haloNodes+ogs->Nhalo,
-            [](const parallelNode_t& a, const parallelNode_t& b) {
-              if(abs(a.baseId) < abs(b.baseId)) return true; //group by abs(baseId)
-              if(abs(a.baseId) > abs(b.baseId)) return false;
+  comm.Alltoall(sendCounts, recvCounts);
+
+  sendOffsets[0] = 0;
+  recvOffsets[0] = 0;
+  for (int r=0;r<size;r++) {
+    sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
+    recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
+  }
+  dlong recvN = recvOffsets[size]; //total ids to recv
+
+  memory<parallelNode_t> recvSharedNodes(recvN);
+
+  //Send all the nodes to their destination rank.
+  comm.Alltoallv(sendSharedNodes, sendCounts, sendOffsets,
+                 recvSharedNodes, recvCounts, recvOffsets);
+
+  //free up some space
+  sendSharedNodes.free();
+  sendCounts.free();
+  recvCounts.free();
+  sendOffsets.free();
+  recvOffsets.free();
+
+  // sort based on abs(baseId)
+  sort(recvSharedNodes.ptr(), recvSharedNodes.ptr()+recvN,
+       [](const parallelNode_t& a, const parallelNode_t& b) {
+         return abs(a.baseId) < abs(b.baseId);
+       });
+
+  //count number of shared nodes we will be sending
+  memory<int> sharedSendCounts(size,0);
+  memory<int> sharedRecvCounts(size);
+  memory<int> sharedSendOffsets(size+1);
+  memory<int> sharedRecvOffsets(size+1);
+
+  start=0;
+  for (dlong n=0;n<recvN;n++) {
+    if (n==recvN-1 || abs(recvSharedNodes[n].baseId)!=abs(recvSharedNodes[n+1].baseId)) {
+      dlong end = n+1;
+
+      for (int i=start;i<end;i++) {
+        //We'll be sending all the shared nodes to each rank involved
+        sharedSendCounts[recvSharedNodes[i].rank] += end-start-1;
+      }
 
-              if(a.baseId > b.baseId) return true; //positive ids first
-              if(a.baseId < b.baseId) return false;
+      //set new baseId group start point
+      start=n+1;
+    }
+  }
 
-              return (a.localId < b.localId); //sort by local id
-            });
+  // Each rank has a set of shared global Ids and for each global id that
+  // rank knows what MPI ranks participate in gathering. We now send this
+  // information to the involved ranks.
 
-  ogs->haloGather.Nrows = 0;
-  ogs->haloScatter.Nrows = 0;
+  //share counts
+  comm.Alltoall(sharedSendCounts, sharedRecvCounts);
 
-  if (ogs->Nhalo) {
-    haloNodes[0].newId = 0;
-    int sign = (haloNodes[0].baseId > 0) ? 1 : -1;
-    haloNodes[0].sign = sign;
-    if (sign > 0) ogs->haloGather.Nrows++;
+  //cumulative sum
+  sharedSendOffsets[0] = 0;
+  sharedRecvOffsets[0] = 0;
+  for (int r=0;r<size;r++) {
+    sharedSendOffsets[r+1] = sharedSendOffsets[r]+sharedSendCounts[r];
+    sharedRecvOffsets[r+1] = sharedRecvOffsets[r]+sharedRecvCounts[r];
+  }
 
-    for (dlong i=1;i<ogs->Nhalo;i++) {
-      if (abs(haloNodes[i].baseId)!=abs(haloNodes[i-1].baseId)) {
-        sign = (haloNodes[i].baseId > 0) ? 1 : -1;
-        ogs->haloScatter.Nrows++;
-        if (sign > 0) ogs->haloGather.Nrows++;
+  //make a send buffer
+  memory<parallelNode_t> sharedSendNodes(sharedSendOffsets[size]);
+
+  //reset sendCounts
+  for (int r=0;r<size;r++) sharedSendCounts[r]=0;
+
+  start=0;
+  for (dlong n=0;n<recvN;n++) {
+    if (n==recvN-1 || abs(recvSharedNodes[n].baseId)!=abs(recvSharedNodes[n+1].baseId)) {
+      dlong end = n+1;
+
+      //build the node list to send
+      for (int i=start;i<end;i++) {
+        const int r = recvSharedNodes[i].rank;
+        const dlong id = recvSharedNodes[i].localId;
+        const int sign = recvSharedNodes[i].sign;
+
+        int sid = sharedSendCounts[r]+sharedSendOffsets[r];
+        for (int j=start;j<end;j++) {
+          if (j==i) continue; //don't bother sending this rank's own node
+          sharedSendNodes[sid] = recvSharedNodes[j];
+          sharedSendNodes[sid].newId = id;
+          sharedSendNodes[sid].sign = sign;
+          sid++;
+        }
+        sharedSendCounts[r] += end-start-1;
       }
 
-      haloNodes[i].newId = ogs->haloScatter.Nrows;
-      haloNodes[i].sign = sign;
+      //set new baseId group start point
+      start=n+1;
     }
-    ogs->haloScatter.Nrows++;
   }
+  recvSharedNodes.free();
+
+  //make sharedNodes to hold the exchange data we recv
+  Nshared = sharedRecvOffsets[size];
+  sharedNodes = memory<parallelNode_t>(Nshared);
+
+  //Share all the gathering info
+  comm.Alltoallv(sharedSendNodes, sharedSendCounts, sharedSendOffsets,
+                     sharedNodes, sharedRecvCounts, sharedRecvOffsets);
+}
+
+//Make the Signed local and halo gather operators using nodes list
+void ogsBase_t::LocalSignedSetup(const dlong Nids, memory<parallelNode_t> &nodes){
+  //N lists hold only the entries with baseId>0; T lists hold every entry of a row
+  gatherLocal = std::make_shared<ogsOperator_t>(platform);
+  gatherHalo  = std::make_shared<ogsOperator_t>(platform);
+
+  gatherLocal->kind = Signed;
+  gatherHalo->kind = Signed;
+
+  gatherLocal->Ncols = N;
+  gatherHalo->Ncols = N;
 
-  // sort based on local ids
-  std::sort(haloNodes, haloNodes+ogs->Nhalo,
-            [](const parallelNode_t& a, const parallelNode_t& b) {
-              return (a.localId < b.localId); //sort by local id
-            });
+  gatherLocal->NrowsN = NlocalP;
+  gatherLocal->NrowsT = NlocalT;
+  gatherHalo->NrowsN = NhaloP;
+  gatherHalo->NrowsT = NhaloT;
 
   //tally up how many nodes are being gathered to each gatherNode and
   //  map to a local ordering
-  dlong *haloGatherCounts  = (dlong*) calloc(ogs->haloGather.Nrows+1,sizeof(dlong));
-  dlong *haloScatterCounts = (dlong*) calloc(ogs->haloScatter.Nrows+1,sizeof(dlong));
-  dlong *haloMap = (dlong*)  calloc(ogs->haloScatter.Nrows+1,sizeof(dlong));
-  hlong *haloIds = (hlong *) calloc(ogs->haloScatter.Nrows+1,sizeof(hlong));
-  hlong *haloIdsSym = (hlong *) calloc(ogs->haloScatter.Nrows+1,sizeof(hlong));
-
-  for (dlong i=0;i<ogs->haloScatter.Nrows;i++) haloMap[i] = -1; //initialize map
-
-  cnt = 0;
-  cnt2 = ogs->haloGather.Nrows;
-  for (dlong i=0;i<ogs->Nhalo;i++) {
-    dlong newId = haloNodes[i].newId; //get the ordered id
-
-    if (haloMap[newId] == -1) {
-      if (haloNodes[i].sign > 0)
-        haloMap[newId] = cnt++;
-      else
-        haloMap[newId] = cnt2++;
-
-      //record the base id of the gathered node
-      haloIds[haloMap[newId]] = haloNodes[i].sign*abs(haloNodes[i].baseId);
-      haloIdsSym[haloMap[newId]] = abs(haloNodes[i].baseId);
+  memory<dlong> localGatherNCounts(gatherLocal->NrowsT,0);
+  memory<dlong> localGatherTCounts(gatherLocal->NrowsT,0);
+  memory<dlong> haloGatherNCounts(gatherHalo->NrowsT,0);
+  memory<dlong> haloGatherTCounts(gatherHalo->NrowsT,0);
+
+  for (dlong i=0;i<Nids;i++) {
+    const dlong gid = nodes[i].newId; //re-mapped baseId on this rank
+
+    if (abs(nodes[i].sign)==1) { //local
+      if (nodes[i].baseId>0) localGatherNCounts[gid]++;  //tally (N list: baseId>0 only)
+      localGatherTCounts[gid]++;  //tally
+    } else { //halo
+      if (nodes[i].baseId>0) haloGatherNCounts[gid]++;  //tally (N list: baseId>0 only)
+      haloGatherTCounts[gid]++;  //tally
     }
-
-    dlong gid = haloMap[newId];
-    haloNodes[i].newId = gid;  //reorder
-    haloScatterCounts[gid]++;  //tally
-    if (haloNodes[i].baseId>0)
-      haloGatherCounts[gid]++;  //tally
   }
-  free(haloMap);
 
-  ogs->haloGather.rowStarts  = (dlong*) calloc(ogs->haloGather.Nrows+1,sizeof(dlong));
-  ogs->haloScatter.rowStarts = (dlong*) calloc(ogs->haloScatter.Nrows+1,sizeof(dlong));
-  for (dlong i=0;i<ogs->haloGather.Nrows;i++) {
-    ogs->haloGather.rowStarts[i+1] = ogs->haloGather.rowStarts[i] + haloGatherCounts[i];
-    haloGatherCounts[i] = 0;
+  //make local row offsets (running sum of the per-row tallies)
+  gatherLocal->rowStartsN.malloc(gatherLocal->NrowsT+1);
+  gatherLocal->rowStartsT.malloc(gatherLocal->NrowsT+1);
+  gatherLocal->rowStartsN[0] = 0;
+  gatherLocal->rowStartsT[0] = 0;
+  for (dlong i=0;i<gatherLocal->NrowsT;i++) {
+    gatherLocal->rowStartsN[i+1] = gatherLocal->rowStartsN[i] + localGatherNCounts[i];
+    gatherLocal->rowStartsT[i+1] = gatherLocal->rowStartsT[i] + localGatherTCounts[i];
+    localGatherNCounts[i] = 0; //reset counters
+    localGatherTCounts[i] = 0; //reset counters
   }
-  for (dlong i=0;i<ogs->haloScatter.Nrows;i++) {
-    ogs->haloScatter.rowStarts[i+1] = ogs->haloScatter.rowStarts[i] + haloScatterCounts[i];
-    haloScatterCounts[i] = 0;
+  gatherLocal->nnzN = gatherLocal->rowStartsN[gatherLocal->NrowsT];
+  gatherLocal->nnzT = gatherLocal->rowStartsT[gatherLocal->NrowsT];
+  gatherLocal->colIdsN.malloc(gatherLocal->nnzN);
+  gatherLocal->colIdsT.malloc(gatherLocal->nnzT);
+
+  //make halo row offsets (running sum of the per-row tallies)
+  gatherHalo->rowStartsN.malloc(gatherHalo->NrowsT+1);
+  gatherHalo->rowStartsT.malloc(gatherHalo->NrowsT+1);
+  gatherHalo->rowStartsN[0] = 0;
+  gatherHalo->rowStartsT[0] = 0;
+  for (dlong i=0;i<gatherHalo->NrowsT;i++) {
+    gatherHalo->rowStartsN[i+1] = gatherHalo->rowStartsN[i] + haloGatherNCounts[i];
+    gatherHalo->rowStartsT[i+1] = gatherHalo->rowStartsT[i] + haloGatherTCounts[i];
+    haloGatherNCounts[i] = 0; //reset counters
+    haloGatherTCounts[i] = 0; //reset counters
   }
+  gatherHalo->nnzN = gatherHalo->rowStartsN[gatherHalo->NrowsT];
+  gatherHalo->nnzT = gatherHalo->rowStartsT[gatherHalo->NrowsT];
+  gatherHalo->colIdsN.malloc(gatherHalo->nnzN);
+  gatherHalo->colIdsT.malloc(gatherHalo->nnzT);
 
-  ogs->haloGather.nnz  = ogs->haloGather.rowStarts[ogs->haloGather.Nrows];
-  ogs->haloScatter.nnz = ogs->haloScatter.rowStarts[ogs->haloScatter.Nrows];
-
-  ogs->haloGather.colIds  = (dlong*) calloc(ogs->haloGather.nnz+1,sizeof(dlong));
-  ogs->haloScatter.colIds = (dlong*) calloc(ogs->haloScatter.nnz+1,sizeof(dlong));
-  for (dlong i=0;i<ogs->Nhalo;i++) {
-    dlong gid = haloNodes[i].newId;
 
-    dlong soffset = ogs->haloScatter.rowStarts[gid];
-    int sindex  = haloScatterCounts[gid];
-    ogs->haloScatter.colIds[soffset+sindex] = haloNodes[i].localId;
-    haloScatterCounts[gid]++;
+  for (dlong i=0;i<Nids;i++) { //fill colIds; the reset counters now give the next free slot in each row
+    const dlong gid = nodes[i].newId;
 
-    if (haloNodes[i].baseId > 0) {
-      dlong goffset = ogs->haloGather.rowStarts[gid];
-      int gindex  = haloGatherCounts[gid];
-      ogs->haloGather.colIds[goffset+gindex] = haloNodes[i].localId;
-      haloGatherCounts[gid]++;
+    if (abs(nodes[i].sign)==1) { //local gather group
+      if (nodes[i].baseId>0) {
+        const dlong soffset = gatherLocal->rowStartsN[gid];
+        const int sindex  = localGatherNCounts[gid];
+        gatherLocal->colIdsN[soffset+sindex] = nodes[i].localId;
+        localGatherNCounts[gid]++;
+      }
+      const dlong soffset = gatherLocal->rowStartsT[gid];
+      const int sindex  = localGatherTCounts[gid];
+      gatherLocal->colIdsT[soffset+sindex] = nodes[i].localId;
+      localGatherTCounts[gid]++;
+    } else { //halo gather group
+      if (nodes[i].baseId>0) {
+        const dlong soffset = gatherHalo->rowStartsN[gid];
+        const int sindex  = haloGatherNCounts[gid];
+        gatherHalo->colIdsN[soffset+sindex] = nodes[i].localId;
+        haloGatherNCounts[gid]++;
+      }
+      const dlong soffset = gatherHalo->rowStartsT[gid];
+      const int sindex  = haloGatherTCounts[gid];
+      gatherHalo->colIdsT[soffset+sindex] = nodes[i].localId;
+      haloGatherTCounts[gid]++;
     }
   }
-  free(haloGatherCounts);
-  free(haloScatterCounts);
-
-  ogs->haloGather.o_rowStarts  = platform.malloc((ogs->haloGather.Nrows+1)*sizeof(dlong), ogs->haloGather.rowStarts);
-  ogs->haloScatter.o_rowStarts = platform.malloc((ogs->haloScatter.Nrows+1)*sizeof(dlong), ogs->haloScatter.rowStarts);
+  localGatherNCounts.free();
+  localGatherTCounts.free();
+  haloGatherNCounts.free();
+  haloGatherTCounts.free();
 
-  ogs->haloGather.o_colIds  = platform.malloc((ogs->haloGather.nnz+1)*sizeof(dlong), ogs->haloGather.colIds);
-  ogs->haloScatter.o_colIds = platform.malloc((ogs->haloScatter.nnz+1)*sizeof(dlong), ogs->haloScatter.colIds);
+  gatherLocal->o_rowStartsN = platform.malloc(gatherLocal->rowStartsN); //NOTE(review): presumably a device copy of the host array - confirm
+  gatherLocal->o_rowStartsT = platform.malloc(gatherLocal->rowStartsT);
+  gatherLocal->o_colIdsN = platform.malloc(gatherLocal->colIdsN);
+  gatherLocal->o_colIdsT = platform.malloc(gatherLocal->colIdsT);
 
-  setupRowBlocks(ogs->haloGather, platform);
-  setupRowBlocks(ogs->haloScatter, platform);
+  gatherHalo->o_rowStartsN = platform.malloc(gatherHalo->rowStartsN);
+  gatherHalo->o_rowStartsT = platform.malloc(gatherHalo->rowStartsT);
+  gatherHalo->o_colIdsN = platform.malloc(gatherHalo->colIdsN);
+  gatherHalo->o_colIdsT = platform.malloc(gatherHalo->colIdsT);
 
-  free(haloNodes);
+  //divide the list of colIds into roughly equal sized blocks so that each
+  // threadblock loads approximately an equal amount of data
+  gatherLocal->setupRowBlocks();
+  gatherHalo->setupRowBlocks();
+}
 
-  //make a host gs handle
-  ogs->Nlocal = ogs->localScatter.Nrows;
-  ogs->Nhalo = ogs->haloScatter.Nrows;
-  ogs->gsh    = ogs::gsSetup(comm, ogs->Nhalo, haloIds, 0,0);
-  ogs->gshSym = ogs::gsSetup(comm, ogs->Nhalo, haloIdsSym, 0,0);
+//Make the Unsigned local and halo gather operators using nodes list
+void ogsBase_t::LocalUnsignedSetup(const dlong Nids, memory<parallelNode_t> &nodes){
 
-  free(haloIds);
-  free(haloIdsSym);
+  gatherLocal = std::make_shared<ogsOperator_t>(platform);
+  gatherHalo  = std::make_shared<ogsOperator_t>(platform);
 
-  free(minRank); free(maxRank);
+  gatherLocal->kind = Unsigned;
+  gatherHalo->kind = Unsigned;
 
-  //total number of owned gathered nodes
-  ogs->Ngather = ogs->localGather.Nrows+ogs->haloGather.Nrows;
+  gatherLocal->Ncols = N;
+  gatherHalo->Ncols = N;
 
-  //total size of halo for gathered array
-  ogs->NgatherHalo = ogs->haloScatter.Nrows-ogs->haloGather.Nrows;
+  gatherLocal->NrowsN = NlocalP;
+  gatherLocal->NrowsT = NlocalT;
+  gatherHalo->NrowsN = NhaloP;
+  gatherHalo->NrowsT = NhaloT;
 
-  hlong NgatherLocal = (hlong) ogs->Ngather;
-  MPI_Allreduce(&NgatherLocal, &(ogs->NgatherGlobal), 1, MPI_HLONG, MPI_SUM, comm);
+  //tally up how many nodes are being gathered to each gatherNode and
+  //  map to a local ordering (only T counters are needed: N reuses the T lists below)
+  memory<dlong> localGatherTCounts(gatherLocal->NrowsT,0);
+  memory<dlong> haloGatherTCounts(gatherHalo->NrowsT,0);
 
-  ogs->hostBuf = nullptr;
-  ogs->haloBuf = nullptr;
-  ogs->hostBufSize = 0;
+  for (dlong i=0;i<Nids;i++) {
+    const dlong gid = nodes[i].newId; //re-mapped baseId on this rank
 
-  return ogs;
-}
+    if (abs(nodes[i].sign)==1) { //local
+      localGatherTCounts[gid]++;  //tally
+    } else { //halo
+      haloGatherTCounts[gid]++;  //tally
+    }
+  }
 
-void ogs_t::Free() {
+  //make local row offsets (running sum of the per-row tallies)
+  gatherLocal->rowStartsT.malloc(gatherLocal->NrowsT+1);
+  gatherLocal->rowStartsN = gatherLocal->rowStartsT; //Unsigned: the N list is simply set to the T list
+  gatherLocal->rowStartsT[0] = 0;
+  for (dlong i=0;i<gatherLocal->NrowsT;i++) {
+    gatherLocal->rowStartsT[i+1] = gatherLocal->rowStartsT[i] + localGatherTCounts[i];
+    localGatherTCounts[i] = 0; //reset counters
+  }
+  gatherLocal->nnzT = gatherLocal->rowStartsT[gatherLocal->NrowsT];
+  gatherLocal->nnzN = gatherLocal->nnzT;
+  gatherLocal->colIdsT.malloc(gatherLocal->nnzT);
+  gatherLocal->colIdsN = gatherLocal->colIdsT;
+
+  //make halo row offsets (running sum of the per-row tallies)
+  gatherHalo->rowStartsT.malloc(gatherHalo->NrowsT+1);
+  gatherHalo->rowStartsN = gatherHalo->rowStartsT; //Unsigned: the N list is simply set to the T list
+  gatherHalo->rowStartsT[0] = 0;
+  for (dlong i=0;i<gatherHalo->NrowsT;i++) {
+    gatherHalo->rowStartsT[i+1] = gatherHalo->rowStartsT[i] + haloGatherTCounts[i];
+    haloGatherTCounts[i] = 0; //reset counters
+  }
+  gatherHalo->nnzT = gatherHalo->rowStartsT[gatherHalo->NrowsT];
+  gatherHalo->nnzN = gatherHalo->nnzT;
+  gatherHalo->colIdsT.malloc(gatherHalo->nnzT);
+  gatherHalo->colIdsN = gatherHalo->colIdsT;
 
-  ogs::gsFree(gsh);
 
-  ogs::Nrefs--;
-  if (!ogs::Nrefs) ogs::freeKernels();
-}
+  for (dlong i=0;i<Nids;i++) { //fill colIds; the reset counters now give the next free slot in each row
+    const dlong gid = nodes[i].newId;
 
-void ogs_t::reallocHostBuffer(size_t Nbytes) {
-  if (Nhalo) {
-    if (hostBufSize < Nhalo*Nbytes) {
-      if (hostBufSize) free(hostBuf);
-      hostBuf = (void *) malloc(Nhalo*Nbytes);
-      hostBufSize = Nhalo*Nbytes;
+    if (abs(nodes[i].sign)==1) { //local gather group
+      const dlong soffset = gatherLocal->rowStartsT[gid];
+      const int sindex  = localGatherTCounts[gid];
+      gatherLocal->colIdsT[soffset+sindex] = nodes[i].localId;
+      localGatherTCounts[gid]++;
+    } else { //halo gather group
+      const dlong soffset = gatherHalo->rowStartsT[gid];
+      const int sindex  = haloGatherTCounts[gid];
+      gatherHalo->colIdsT[soffset+sindex] = nodes[i].localId;
+      haloGatherTCounts[gid]++;
     }
   }
-}
+  localGatherTCounts.free();
+  haloGatherTCounts.free();
 
-void ogs_t::reallocOccaBuffer(size_t Nbytes) {
-  if (Nhalo) {
-    if (o_haloBuf.size() < Nhalo*Nbytes) {
-      if (o_haloBuf.size()) o_haloBuf.free();
-      haloBuf = platform.hostMalloc(Nhalo*Nbytes, nullptr, h_haloBuf);
-      o_haloBuf = platform.malloc(Nhalo*Nbytes);
-    }
-  }
+  gatherLocal->o_rowStartsT = platform.malloc(gatherLocal->rowStartsT); //NOTE(review): presumably a device copy of the host array - confirm
+  gatherLocal->o_rowStartsN = gatherLocal->o_rowStartsT;
+  gatherLocal->o_colIdsT = platform.malloc(gatherLocal->colIdsT);
+  gatherLocal->o_colIdsN = gatherLocal->o_colIdsT;
+
+  gatherHalo->o_rowStartsT = platform.malloc(gatherHalo->rowStartsT);
+  gatherHalo->o_rowStartsN = gatherHalo->o_rowStartsT;
+  gatherHalo->o_colIdsT = platform.malloc(gatherHalo->colIdsT);
+  gatherHalo->o_colIdsN = gatherHalo->o_colIdsT;
+
+  //divide the list of colIds into roughly equal sized blocks so that each
+  // threadblock loads approximately an equal amount of data
+  gatherLocal->setupRowBlocks();
+  gatherHalo->setupRowBlocks();
 }
 
-void setupRowBlocks(ogsData_t &A, platform_t &platform) {
+//Make the halo gather operator (gatherHalo only) using nodes list
+void ogsBase_t::LocalHaloSetup(const dlong Nids, memory<parallelNode_t> &nodes){
 
-  dlong blockSum=0;
-  A.NrowBlocks=0;
-  if (A.Nrows) A.NrowBlocks++;
-  for (dlong i=0;i<A.Nrows;i++) {
-    dlong rowSize = A.rowStarts[i+1]-A.rowStarts[i];
+  gatherHalo  = std::make_shared<ogsOperator_t>(platform);
+  gatherHalo->kind = Signed;
 
-    if (rowSize > ogs::gatherNodesPerBlock) {
-      //this row is pathalogically big. We can't currently run this
-      stringstream ss;
-      ss << "Multiplicity of global node id: " << i << "in ogsSetup is too large.";
-      LIBP_ABORT(ss.str())
-    }
+  gatherHalo->Ncols = N;
 
-    if (blockSum+rowSize > ogs::gatherNodesPerBlock) { //adding this row will exceed the nnz per block
-      A.NrowBlocks++; //count the previous block
-      blockSum=rowSize; //start a new row block
-    } else {
-      blockSum+=rowSize; //add this row to the block
+  gatherHalo->NrowsN = NhaloP;
+  gatherHalo->NrowsT = NhaloT;
+
+  //tally up how many nodes are being gathered to each gatherNode and
+  //  map to a local ordering
+  memory<dlong> haloGatherNCounts(gatherHalo->NrowsT,0);
+  memory<dlong> haloGatherTCounts(gatherHalo->NrowsT,0);
+
+  for (dlong i=0;i<Nids;i++) {
+    const dlong gid = nodes[i].newId; //re-mapped baseId on this rank
+
+    if (abs(nodes[i].sign)==2) {//halo (entries with any other sign are ignored here)
+      if (nodes[i].sign==2) haloGatherNCounts[gid]++;  //tally (N list: sign==+2 only)
+      haloGatherTCounts[gid]++;  //tally
     }
   }
 
-  A.blockRowStarts  = (dlong*) calloc(A.NrowBlocks+1,sizeof(dlong));
+  //make halo row offsets (running sum of the per-row tallies)
+  gatherHalo->rowStartsN.malloc(gatherHalo->NrowsT+1);
+  gatherHalo->rowStartsT.malloc(gatherHalo->NrowsT+1);
+  gatherHalo->rowStartsN[0]=0;
+  gatherHalo->rowStartsT[0]=0;
+  for (dlong i=0;i<gatherHalo->NrowsT;i++) {
+    gatherHalo->rowStartsN[i+1] = gatherHalo->rowStartsN[i] + haloGatherNCounts[i];
+    gatherHalo->rowStartsT[i+1] = gatherHalo->rowStartsT[i] + haloGatherTCounts[i];
+    haloGatherNCounts[i] = 0; //reset counters
+    haloGatherTCounts[i] = 0; //reset counters
+  }
+  gatherHalo->nnzN = gatherHalo->rowStartsN[gatherHalo->NrowsT];
+  gatherHalo->nnzT = gatherHalo->rowStartsT[gatherHalo->NrowsT];
+  gatherHalo->colIdsN.malloc(gatherHalo->nnzN);
+  gatherHalo->colIdsT.malloc(gatherHalo->nnzT);
 
-  blockSum=0;
-  A.NrowBlocks=0;
-  if (A.Nrows) A.NrowBlocks++;
-  for (dlong i=0;i<A.Nrows;i++) {
-    dlong rowSize = A.rowStarts[i+1]-A.rowStarts[i];
 
-    if (blockSum+rowSize > ogs::gatherNodesPerBlock) { //adding this row will exceed the nnz per block
-      A.blockRowStarts[A.NrowBlocks++] = i; //mark the previous block
-      blockSum=rowSize; //start a new row block
-    } else {
-      blockSum+=rowSize; //add this row to the block
+  for (dlong i=0;i<Nids;i++) { //fill colIds; the reset counters now give the next free slot in each row
+    const dlong gid = nodes[i].newId;
+
+    if (abs(nodes[i].sign)==2) {
+      if (nodes[i].sign==2) {
+        const dlong soffset = gatherHalo->rowStartsN[gid];
+        const int sindex  = haloGatherNCounts[gid];
+        gatherHalo->colIdsN[soffset+sindex] = nodes[i].localId;
+        haloGatherNCounts[gid]++;
+      }
+      const dlong soffset = gatherHalo->rowStartsT[gid];
+      const int sindex  = haloGatherTCounts[gid];
+      gatherHalo->colIdsT[soffset+sindex] = nodes[i].localId;
+      haloGatherTCounts[gid]++;
     }
   }
-  A.blockRowStarts[A.NrowBlocks] = A.Nrows;
+  haloGatherNCounts.free();
+  haloGatherTCounts.free();
+
+  gatherHalo->o_rowStartsN = platform.malloc(gatherHalo->rowStartsN); //NOTE(review): presumably a device copy of the host array - confirm
+  gatherHalo->o_rowStartsT = platform.malloc(gatherHalo->rowStartsT);
+  gatherHalo->o_colIdsN = platform.malloc(gatherHalo->colIdsN);
+  gatherHalo->o_colIdsT = platform.malloc(gatherHalo->colIdsT);
+
+  //divide the list of colIds into roughly equal sized blocks so that each
+  // threadblock loads approximately an equal amount of data
+  gatherHalo->setupRowBlocks();
+}
+
+void ogsBase_t::Free() { //drop the communicator, operators and exchange, then zero the size counters
+  comm.Free();
+  gatherLocal = nullptr; //releases this handle's reference to the operator
+  gatherHalo = nullptr;
+  exchange = nullptr;
+  N=0;
+  NlocalT=0;
+  NhaloT=0; //NOTE(review): NlocalP and NhaloP are not reset here - confirm that is intended
+  Ngather=0;
+  NgatherGlobal=0; //SetupGlobalToLocalMapping treats NgatherGlobal==0 as "not set up"
+}
+
+void ogsBase_t::AssertGatherDefined() { //guard for callers that need a well-defined gather
+  LIBP_ABORT("Gather operation not well-defined.", //presumably aborts when the condition below is true - confirm against the macro
+             !gather_defined);
+}
+
+//Populate the local mapping of the original ids and the gathered ordering
+void ogs_t::SetupGlobalToLocalMapping(memory<dlong> GlobalToLocal) {
+  //NgatherGlobal==0 marks a handle that is not set up (presumably LIBP_ABORT fires when the condition is true)
+  LIBP_ABORT("ogs handle is not set up.",
+             NgatherGlobal==0);
+
+  //Note: GlobalToLocal must have N entries; entries [0,N) are all overwritten below.
+
+  memory<dlong> ids(NlocalT+NhaloT);
+  //ids[n] = n for every local and halo row
+  for (dlong n=0;n<NlocalT+NhaloT;n++)
+    ids[n] = n;
+  //start from -1 everywhere; entries not written by the scatters below keep this value
+  for (dlong n=0;n<N;n++)
+    GlobalToLocal[n] = -1;
+  //local rows take ids[0,NlocalT); halo rows take ids from NlocalT onwards
+  gatherLocal->Scatter(GlobalToLocal, ids,
+                       1, NoTrans);
+  gatherHalo->Scatter(GlobalToLocal, ids+NlocalT,
+                       1, NoTrans);
+}
+
+void halo_t::SetupFromGather(ogs_t& ogs) {
+  //Build this halo from an existing gather handle: copy its sizes and reuse its exchange object.
+  ogs.AssertGatherDefined();
+
+  platform = ogs.platform;
+  comm = ogs.comm;
+
+  N = ogs.NlocalT + ogs.NhaloT; //every local row plus every halo row of ogs
+
+  Ngather = ogs.Ngather; //was 'Ngather = Ngather', a self-assignment that copied nothing from ogs
+  Nhalo = ogs.NhaloT - ogs.NhaloP;
+
+  NgatherGlobal = ogs.NgatherGlobal;
+
+  kind = Halo;
+  unique = ogs.unique;
+
+  NlocalP = ogs.NlocalP;
+  NlocalT  = ogs.NlocalT;
+
+  NhaloP = ogs.NhaloP;
+  NhaloT  = ogs.NhaloT;
+
+  gather_defined=false; //AssertGatherDefined() on this halo will report the gather as not well-defined
+
+  gathered_halo=true;
+
+  exchange = ogs.exchange; //same exchange object as ogs, not a new one
+}
+
+} //namespace ogs
 
-  A.o_blockRowStarts = platform.malloc((A.NrowBlocks+1)*sizeof(dlong), A.blockRowStarts);
-}
\ No newline at end of file
+} //namespace libp
diff --git a/libs/ogs/ogsUtils.cpp b/libs/ogs/ogsUtils.cpp
new file mode 100644
index 000000000..69c469a02
--- /dev/null
+++ b/libs/ogs/ogsUtils.cpp
@@ -0,0 +1,127 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include <limits>
+#include "ogs.hpp"
+#include "ogs/ogsOperator.hpp"
+#include "ogs/ogsExchange.hpp"
+#include "ogs/ogsUtils.hpp"
+
+namespace libp {
+
+namespace ogs {
+
+stream_t ogsBase_t::dataStream;
+//Static kernel tables filled by InitializeKernels: indexed [type][op], or [type] alone
+kernel_t ogsOperator_t::gatherScatterKernel[4][4];
+kernel_t ogsOperator_t::gatherKernel[4][4];
+kernel_t ogsOperator_t::scatterKernel[4];
+
+kernel_t ogsExchange_t::extractKernel[4];
+
+
+void InitializeKernels(platform_t& platform, const Type type, const Op op) {
+  //Build the gatherScatter/gather kernels for this (type, op) pair, and the scatter/extract kernels for this type.
+  //check if the gather kernel is initialized (if so, nothing is rebuilt)
+  if (!ogsOperator_t::gatherKernel[type][op].isInitialized()) {
+
+    properties_t kernelInfo = platform.props();
+
+    kernelInfo["defines/p_blockSize"] = ogsOperator_t::blockSize;
+    kernelInfo["defines/p_gatherNodesPerBlock"] = ogsOperator_t::gatherNodesPerBlock;
+    //T: the element type the kernels are compiled for
+    switch (type) {
+      case Float:  kernelInfo["defines/T"] =  "float"; break;
+      case Double: kernelInfo["defines/T"] =  "double"; break;
+      case Int32:  kernelInfo["defines/T"] =  "int32_t"; break;
+      case Int64:  kernelInfo["defines/T"] =  "int64_t"; break;
+    }
+    //OGS_OP_INIT: 0 for Add, 1 for Mul, +max for Min, -max for Max
+    switch (type) {
+      case Float:
+        switch (op) {
+          case Add: kernelInfo["defines/OGS_OP_INIT"] =  float{0}; break;
+          case Mul: kernelInfo["defines/OGS_OP_INIT"] =  float{1}; break;
+          case Min: kernelInfo["defines/OGS_OP_INIT"] =  std::numeric_limits<float>::max(); break;
+          case Max: kernelInfo["defines/OGS_OP_INIT"] = -std::numeric_limits<float>::max(); break;
+        }
+        break;
+      case Double:
+        switch (op) {
+          case Add: kernelInfo["defines/OGS_OP_INIT"] =  double{0}; break;
+          case Mul: kernelInfo["defines/OGS_OP_INIT"] =  double{1}; break;
+          case Min: kernelInfo["defines/OGS_OP_INIT"] =  std::numeric_limits<double>::max(); break;
+          case Max: kernelInfo["defines/OGS_OP_INIT"] = -std::numeric_limits<double>::max(); break;
+        }
+        break;
+      case Int32: //NOTE(review): for integers -max() is one above the most negative value - confirm that is acceptable for Max
+        switch (op) {
+          case Add: kernelInfo["defines/OGS_OP_INIT"] =  int32_t{0}; break;
+          case Mul: kernelInfo["defines/OGS_OP_INIT"] =  int32_t{1}; break;
+          case Min: kernelInfo["defines/OGS_OP_INIT"] =  std::numeric_limits<int32_t>::max(); break;
+          case Max: kernelInfo["defines/OGS_OP_INIT"] = -std::numeric_limits<int32_t>::max(); break;
+        }
+        break;
+      case Int64:
+        switch (op) {
+          case Add: kernelInfo["defines/OGS_OP_INIT"] =  int64_t{0}; break;
+          case Mul: kernelInfo["defines/OGS_OP_INIT"] =  int64_t{1}; break;
+          case Min: kernelInfo["defines/OGS_OP_INIT"] =  std::numeric_limits<int64_t>::max(); break;
+          case Max: kernelInfo["defines/OGS_OP_INIT"] = -std::numeric_limits<int64_t>::max(); break;
+        }
+        break;
+    }
+    //OGS_OP(a,b): the in-place update of a by b for each op
+    switch (op) {
+      case Add: kernelInfo["defines/OGS_OP(a,b)"] = "a+=b"; break;
+      case Mul: kernelInfo["defines/OGS_OP(a,b)"] = "a*=b"; break;
+      case Min: kernelInfo["defines/OGS_OP(a,b)"] = "if(b<a) a=b"; break;
+      case Max: kernelInfo["defines/OGS_OP(a,b)"] = "if(b>a) a=b"; break;
+    }
+
+    ogsOperator_t::gatherScatterKernel[type][op] = platform.buildKernel(OGS_DIR "/okl/ogsKernels.okl",
+                                                         "gatherScatter",
+                                                         kernelInfo);
+
+
+    ogsOperator_t::gatherKernel[type][op] = platform.buildKernel(OGS_DIR "/okl/ogsKernels.okl",
+                                                "gather",
+                                                kernelInfo);
+    //scatter and extract are stored per type only, so they are built once per type
+    if (!ogsOperator_t::scatterKernel[type].isInitialized()) {
+      ogsOperator_t::scatterKernel[type] = platform.buildKernel(OGS_DIR "/okl/ogsKernels.okl",
+                                                 "scatter",
+                                                 kernelInfo);
+
+      ogsExchange_t::extractKernel[type] = platform.buildKernel(OGS_DIR "/okl/ogsKernels.okl",
+                                                "extract", kernelInfo);
+    }
+  }
+}
+
+} //namespace ogs
+
+} //namespace libp
diff --git a/libs/ogs/okl/gatherScatter.okl b/libs/ogs/okl/gatherScatter.okl
deleted file mode 100644
index 6d80f65e4..000000000
--- a/libs/ogs/okl/gatherScatter.okl
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-// OCCA will #include "ogsDefs.h" before compilation
-
-/*------------------------------------------------------------------------------
-  The basic gather-scatter kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_GATHERSCATTER(T,OP)                                              \
-@kernel void gatherScatter_##T##_##OP(const dlong Nblocks,                      \
-                                      const int   Nentries,                     \
-                                      const int   Nvectors,                     \
-                                      const dlong stride,                       \
-                                      @restrict const dlong *blockStarts,       \
-                                      @restrict const dlong *gatherStarts,      \
-                                      @restrict const dlong *gatherIds,         \
-                                      @restrict const dlong *scatterStarts,     \
-                                      @restrict const dlong *scatterIds,        \
-                                      @restrict           T *q)                 \
-{                                                                               \
-  for(dlong m=0;m<Nvectors;++m;@outer(2)){                                      \
-    for(dlong k=0;k<Nentries;++k;@outer(1)){                                    \
-      for(dlong b=0;b<Nblocks;++b;@outer(0)){                                   \
-        @exclusive dlong blockStart, blockEnd, gStart, sStart;                  \
-        @shared T gtemp[p_gatherNodesPerBlock];                                 \
-        @shared T stemp[p_gatherNodesPerBlock];                                 \
-                                                                                \
-        for(dlong n=0;n<p_blockSize;++n;@inner){                                \
-          blockStart = blockStarts[b];                                          \
-          blockEnd   = blockStarts[b+1];                                        \
-          gStart = gatherStarts[blockStart];                                    \
-          sStart = scatterStarts[blockStart];                                   \
-                                                                                \
-          for (dlong id=gStart+n;id<gatherStarts[blockEnd];id+=p_blockSize) {   \
-            gtemp[id-gStart] = q[k+gatherIds[id]*Nentries+m*stride];            \
-          }                                                                     \
-        }                                                                       \
-                                                                                \
-        for(dlong n=0;n<p_blockSize;++n;@inner){                                \
-          for (dlong row=blockStart+n;row<blockEnd;row+=p_blockSize) {          \
-            const dlong gRowStart = gatherStarts[row]  -gStart;                 \
-            const dlong gRowEnd   = gatherStarts[row+1]-gStart;                 \
-            const dlong sRowStart = scatterStarts[row]  -sStart;                \
-            const dlong sRowEnd   = scatterStarts[row+1]-sStart;                \
-            T gq = init_##T##_##OP;                                             \
-            for (dlong i=gRowStart;i<gRowEnd;i++) {                             \
-              OGS_DO_##OP(gq,gtemp[i]);                                         \
-            }                                                                   \
-            for (dlong i=sRowStart;i<sRowEnd;i++) {                             \
-              stemp[i] = gq;                                                    \
-            }                                                                   \
-          }                                                                     \
-        }                                                                       \
-                                                                                \
-        for(dlong n=0;n<p_blockSize;++n;@inner){                                \
-          for (dlong id=sStart+n;id<scatterStarts[blockEnd];id+=p_blockSize) {  \
-            q[k+scatterIds[id]*Nentries+m*stride] = stemp[id-sStart];           \
-          }                                                                     \
-        }                                                                       \
-      }                                                                         \
-    }                                                                           \
-  }                                                                             \
-}
-
-/*------------------------------------------------------------------------------
-  The basic gather kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_GATHER(T,OP)                                                     \
-@kernel void gather_##T##_##OP(const dlong Nblocks,                             \
-                               const int   Nentries,                            \
-                               const int   Nvectors,                            \
-                               const dlong stride,                              \
-                               const dlong gstride,                             \
-                               @restrict const dlong *blockStarts,              \
-                               @restrict const dlong *gatherStarts,             \
-                               @restrict const dlong *gatherIds,                \
-                               @restrict const     T *q,                        \
-                               @restrict           T *gatherq)                  \
-{                                                                               \
-  for(dlong m=0;m<Nvectors;++m;@outer(2)){                                      \
-    for(dlong k=0;k<Nentries;++k;@outer(1)){                                    \
-      for(dlong b=0;b<Nblocks;++b;@outer(0)){                                   \
-        @exclusive dlong blockStart, blockEnd, start;                           \
-        @shared T temp[p_gatherNodesPerBlock];                                  \
-                                                                                \
-        for(dlong n=0;n<p_blockSize;++n;@inner){                                \
-          blockStart = blockStarts[b];                                          \
-          blockEnd   = blockStarts[b+1];                                        \
-          start = gatherStarts[blockStart];                                     \
-                                                                                \
-          for (dlong id=start+n;id<gatherStarts[blockEnd];id+=p_blockSize) {    \
-            temp[id-start] = q[k+gatherIds[id]*Nentries+m*stride];              \
-          }                                                                     \
-        }                                                                       \
-                                                                                \
-        for(dlong n=0;n<p_blockSize;++n;@inner){                                \
-          for (dlong row=blockStart+n;row<blockEnd;row+=p_blockSize) {          \
-            const dlong rowStart = gatherStarts[row]  -start;                   \
-            const dlong rowEnd   = gatherStarts[row+1]-start;                   \
-            T gq = init_##T##_##OP;                                             \
-            for (dlong i=rowStart;i<rowEnd;i++) {                               \
-              OGS_DO_##OP(gq,temp[i]);                                          \
-            }                                                                   \
-            gatherq[k+row*Nentries+m*gstride] = gq;                             \
-          }                                                                     \
-        }                                                                       \
-      }                                                                         \
-    }                                                                           \
-  }                                                                             \
-}
-
-/*------------------------------------------------------------------------------
-  The basic scatter kernel
-------------------------------------------------------------------------------*/
-#define DEFINE_SCATTER(T)                                                       \
-@kernel void scatter_##T(const dlong Nblocks,                                   \
-                         const int   Nentries,                                  \
-                         const int   Nvectors,                                  \
-                         const dlong gstride,                                   \
-                         const dlong stride,                                    \
-                         @restrict const dlong *blockStarts,                    \
-                         @restrict const dlong *scatterStarts,                  \
-                         @restrict const dlong *scatterIds,                     \
-                         @restrict const     T *gatherq,                        \
-                         @restrict           T *q)                              \
-{                                                                               \
-  for(dlong m=0;m<Nvectors;++m;@outer(2)){                                      \
-    for(dlong k=0;k<Nentries;++k;@outer(1)){                                    \
-      for(dlong b=0;b<Nblocks;++b;@outer(0)){                                   \
-        @exclusive dlong blockStart, blockEnd, start;                           \
-        @shared T temp[p_gatherNodesPerBlock];                                  \
-                                                                                \
-        for(dlong n=0;n<p_blockSize;++n;@inner){                                \
-          blockStart = blockStarts[b];                                          \
-          blockEnd   = blockStarts[b+1];                                        \
-          start = scatterStarts[blockStart];                                    \
-          for (dlong row=blockStart+n;row<blockEnd;row+=p_blockSize) {          \
-            const dlong rowStart = scatterStarts[row]  -start;                  \
-            const dlong rowEnd   = scatterStarts[row+1]-start;                  \
-            temp[rowStart] = gatherq[k+row*Nentries+m*gstride];                 \
-            for (dlong i=rowStart+1;i<rowEnd;i++) {                             \
-              temp[i] = temp[rowStart];                                         \
-            }                                                                   \
-          }                                                                     \
-        }                                                                       \
-                                                                                \
-        for(dlong n=0;n<p_blockSize;++n;@inner){                                \
-          for (dlong id=start+n;id<scatterStarts[blockEnd];id+=p_blockSize) {   \
-            q[k+scatterIds[id]*Nentries+m*stride] = temp[id-start];             \
-          }                                                                     \
-        }                                                                       \
-      }                                                                         \
-    }                                                                           \
-  }                                                                             \
-}
-
-#define DEFINE_PROCS(T) \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHERSCATTER) \
-  OGS_FOR_EACH_OP(T,DEFINE_GATHER) \
-  DEFINE_SCATTER(T)
-
-OGS_FOR_EACH_TYPE(DEFINE_PROCS)
diff --git a/libs/ogs/okl/ogsKernels.okl b/libs/ogs/okl/ogsKernels.okl
new file mode 100644
index 000000000..26b972a13
--- /dev/null
+++ b/libs/ogs/okl/ogsKernels.okl
@@ -0,0 +1,177 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+// OCCA properties will define T, OGS_OP_INIT, and OGS_OP
+
+/*------------------------------------------------------------------------------
+  The basic gather-scatter kernel
+------------------------------------------------------------------------------*/
+@kernel void gatherScatter(const dlong Nblocks,                  // number of row blocks; one @outer(0) iteration each
+                           const int K,                          // entries stored per node; q is indexed as k+id*K
+                          @restrict const dlong *blockStarts,    // block b owns rows blockStarts[b] .. blockStarts[b+1]-1
+                          @restrict const dlong *gatherStarts,   // row r reads gatherIds[gatherStarts[r] .. gatherStarts[r+1]-1]
+                          @restrict const dlong *gatherIds,      // node indices read from q
+                          @restrict const dlong *scatterStarts,  // row r writes scatterIds[scatterStarts[r] .. scatterStarts[r+1]-1]
+                          @restrict const dlong *scatterIds,     // node indices written to q
+                          @restrict           T *q) {            // read and overwritten in place
+  // Per row: combine the gathered entries of q with OGS_OP, then store the result at every scatter id of that row.
+  for(dlong k=0;k<K;++k;@outer(1)){
+    for(dlong b=0;b<Nblocks;++b;@outer(0)){
+      @exclusive dlong blockStart, blockEnd, gStart, sStart; // set in the first @inner loop, reused by the later ones
+      @shared T gtemp[p_gatherNodesPerBlock]; // staged inputs, indexed by (gather position - gStart)
+      @shared T stemp[p_gatherNodesPerBlock]; // staged results, indexed by (scatter position - sStart)
+      // NOTE(review): both buffers assume a block spans at most p_gatherNodesPerBlock gather AND scatter positions - confirm host-side blocking
+      for(dlong n=0;n<p_blockSize;++n;@inner(0)){
+        blockStart = blockStarts[b];
+        blockEnd   = blockStarts[b+1];
+        gStart = gatherStarts[blockStart];
+        sStart = scatterStarts[blockStart];
+
+        for (dlong id=gStart+n;id<gatherStarts[blockEnd];id+=p_blockSize) { // stage 1: load the block's gather entries, stride p_blockSize
+          gtemp[id-gStart] = q[k+gatherIds[id]*K];
+        }
+      }
+      // stage 2: rows are dealt out with stride p_blockSize; each row is reduced and its result replicated over its scatter slots
+      for(dlong n=0;n<p_blockSize;++n;@inner(0)){
+        for (dlong row=blockStart+n;row<blockEnd;row+=p_blockSize) {
+          const dlong gRowStart = gatherStarts[row]  -gStart;
+          const dlong gRowEnd   = gatherStarts[row+1]-gStart;
+          const dlong sRowStart = scatterStarts[row]  -sStart;
+          const dlong sRowEnd   = scatterStarts[row+1]-sStart;
+          T gq = OGS_OP_INIT; // starting value supplied by the OCCA kernel properties
+          for (dlong i=gRowStart;i<gRowEnd;i++) {
+            OGS_OP(gq,gtemp[i]);
+          }
+          for (dlong i=sRowStart;i<sRowEnd;i++) {
+            stemp[i] = gq;
+          }
+        }
+      }
+      // stage 3: write the staged results back to q, stride p_blockSize over the block's scatter positions
+      for(dlong n=0;n<p_blockSize;++n;@inner(0)){
+        for (dlong id=sStart+n;id<scatterStarts[blockEnd];id+=p_blockSize) {
+          q[k+scatterIds[id]*K] = stemp[id-sStart];
+        }
+      }
+    }
+  }
+}
+
+/*------------------------------------------------------------------------------
+  The basic gather kernel
+------------------------------------------------------------------------------*/
+@kernel void gather(const dlong Nblocks,                  // number of row blocks; one @outer(0) iteration each
+                    const int K,                          // entries stored per node/row; indexed as k+id*K
+                   @restrict const dlong *blockStarts,    // block b owns rows blockStarts[b] .. blockStarts[b+1]-1
+                   @restrict const dlong *gatherStarts,   // row r reads gatherIds[gatherStarts[r] .. gatherStarts[r+1]-1]
+                   @restrict const dlong *gatherIds,      // node indices read from q
+                   @restrict const     T *q,              // input values
+                   @restrict           T *gatherq){       // output: one reduced value per (row,k), stored at k+row*K
+  // Per row: combine the gathered entries of q with OGS_OP (starting from OGS_OP_INIT) and store the result in gatherq.
+  for(dlong k=0;k<K;++k;@outer(1)){
+    for(dlong b=0;b<Nblocks;++b;@outer(0)){
+      @exclusive dlong blockStart, blockEnd, start; // set in the first @inner loop, reused by the second
+      @shared T temp[p_gatherNodesPerBlock]; // staged inputs; assumes a block spans at most this many gather positions - TODO confirm
+
+      for(dlong n=0;n<p_blockSize;++n;@inner(0)){
+        blockStart = blockStarts[b];
+        blockEnd   = blockStarts[b+1];
+        start = gatherStarts[blockStart];
+
+        for (dlong id=start+n;id<gatherStarts[blockEnd];id+=p_blockSize) { // stage 1: load the block's gather entries, stride p_blockSize
+          temp[id-start] = q[k+gatherIds[id]*K];
+        }
+      }
+      // stage 2: rows are dealt out with stride p_blockSize; each row is reduced from the staged values
+      for(dlong n=0;n<p_blockSize;++n;@inner(0)){
+        for (dlong row=blockStart+n;row<blockEnd;row+=p_blockSize) {
+          const dlong rowStart = gatherStarts[row]  -start;
+          const dlong rowEnd   = gatherStarts[row+1]-start;
+          T gq = OGS_OP_INIT; // starting value supplied by the OCCA kernel properties
+          for (dlong i=rowStart;i<rowEnd;i++) {
+            OGS_OP(gq,temp[i]);
+          }
+          gatherq[k+row*K] = gq;
+        }
+      }
+    }
+  }
+}
+
+/*------------------------------------------------------------------------------
+  The basic scatter kernel
+------------------------------------------------------------------------------*/
+@kernel void scatter(const dlong Nblocks,                    // number of row blocks; one @outer(0) iteration each
+                     const int K,                            // entries stored per node/row; indexed as k+id*K
+                     @restrict const dlong *blockStarts,     // block b owns rows blockStarts[b] .. blockStarts[b+1]-1
+                     @restrict const dlong *scatterStarts,   // row r writes scatterIds[scatterStarts[r] .. scatterStarts[r+1]-1]
+                     @restrict const dlong *scatterIds,      // node indices written to q
+                     @restrict const     T *gatherq,         // input: one value per (row,k), read at k+row*K
+                     @restrict           T *q) {             // output values
+  // Per row: copy gatherq's value for that row to every scatter id of the row in q.
+  for(dlong k=0;k<K;++k;@outer(1)){
+    for(dlong b=0;b<Nblocks;++b;@outer(0)){
+      @exclusive dlong rowStart, rowEnd; // first and one-past-last row of this block, reused by the second @inner loop
+      @shared T temp[p_gatherNodesPerBlock]; // staged outputs; assumes a block spans at most this many scatter positions - TODO confirm
+
+      for(dlong n=0;n<p_blockSize;++n;@inner(0)){
+        rowStart = blockStarts[b];
+        rowEnd   = blockStarts[b+1];
+        dlong idStart = scatterStarts[rowStart]; // first scatter position of the block; temp is indexed relative to it
+        dlong row = n+rowStart;
+        while (row<rowEnd) { // stage 1: rows are dealt out with stride p_blockSize
+          const int colStart = scatterStarts[row]  -idStart;
+          const int colEnd   = scatterStarts[row+1]-idStart;
+          T foo = gatherq[k+row*K]; // the row's value, replicated over all of its scatter slots
+          for (int i=colStart;i<colEnd;i++) {
+            temp[i] = foo;
+          }
+          row += p_blockSize;
+        }
+      }
+      // stage 2: write the staged values to q, stride p_blockSize over the block's scatter positions
+      for(dlong n=0;n<p_blockSize;++n;@inner(0)){
+        const dlong row = scatterStarts[rowStart]+n; // despite the name, this is a position in scatterIds, not a row index
+        for (dlong i=0;row+i<scatterStarts[rowEnd];i+=p_blockSize) {
+          q[k+scatterIds[row+i]*K] = temp[i+n];
+        }
+      }
+    }
+  }
+}
+
+//extract sparse entries from vector: gatherq[k+gid*K] = q[k+ids[gid]*K] for gid in [0,N), k in [0,K)
+@kernel void extract(const dlong N,                     // number of ids to extract
+                     const int K,                       // entries stored per id
+                     @restrict const dlong *ids,        // source node index for each extracted entry
+                     @restrict const T *q,              // input values, indexed as k+id*K
+                           @restrict T *gatherq) {      // packed output, N*K values
+  for(dlong n=0;n<N*K;++n;@tile(p_blockSize, @outer(0), @inner(0))){ // one iteration per output value
+    const dlong gid = n/K; // which extracted id this output value belongs to
+    const int k = n%K;     // which of its K entries
+    gatherq[n] = q[k+ids[gid]*K];
+  }
+}
diff --git a/libs/parAdogs/parAdogsConnect.cpp b/libs/parAdogs/parAdogsConnect.cpp
new file mode 100644
index 000000000..719aaf226
--- /dev/null
+++ b/libs/parAdogs/parAdogsConnect.cpp
@@ -0,0 +1,242 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+typedef struct { // one element face, used by graph_t::Connect to pair up faces that share the same vertices
+  hlong v[graph_t::MAX_NFACEVERTS]; // vertices on face (Connect sorts them ascending before comparing faces)
+  hlong element, elementN; // global id of the owning element, and of the neighbor across the face (-1 until matched)
+  int face, faceN;    // face info: face number within the owning element, and within the neighbor (-1 until matched)
+  int rank; // rank that sent the face; only set on the copies placed in the send buffer
+
+}parallelFace_t;
+
+
+void graph_t::Connect(){
+  /*Fill elements[e].E[f]/F[f] with the global id and face number of the neighbor across each face (-1 where no match is found)*/
+  /*Global number of elements*/
+  gNVertsGlobal=static_cast<hlong>(Nverts);
+  gcomm.Allreduce(gNVertsGlobal); // presumably a sum over gcomm - confirm the comm_t default reduction
+
+  /*Get global element count offsets*/
+  hlong localNverts=static_cast<hlong>(Nverts);
+  gcomm.Scan(localNverts, gVoffsetU); // presumably an inclusive prefix sum, making gVoffsetU one past this rank's last global id - confirm
+  gVoffsetL = gVoffsetU-Nverts;
+
+  /* build list of faces */
+  memory<parallelFace_t> faces(Nelements*Nfaces);
+
+  for(dlong e=0;e<Nelements;++e){
+    for(int f=0;f<Nfaces;++f){
+      const dlong id = f+Nfaces*e;
+
+      for(int n=0;n<NfaceVerts;++n){
+        dlong vid = faceVerts[f*NfaceVerts+n]; // element-local vertex number of the n-th vertex of face f
+        faces[id].v[n] = elements[e].V[vid];
+      }
+
+      std::sort(faces[id].v, faces[id].v+NfaceVerts, // ascending, so two copies of a face compare equal and v[0] is its smallest vertex
+                std::less<hlong>());
+
+      faces[id].element = e + gVoffsetL; // global element id
+      faces[id].face = f;
+
+      faces[id].elementN= -1; // -1 marks "no neighbor found yet"
+      faces[id].faceN = -1;
+    }
+  }
+
+  /* sort faces by their vertex number pairs */
+  std::sort(faces.ptr(), faces.ptr()+Nelements*Nfaces,
+            [&](const parallelFace_t& a, const parallelFace_t& b) {
+              return std::lexicographical_compare(a.v, a.v+NfaceVerts,
+                                                  b.v, b.v+NfaceVerts);
+            });
+
+  /* scan through sorted face lists looking for adjacent
+     faces that have the same vertex ids */
+  for(dlong n=0;n<Nelements*Nfaces-1;++n){
+    if(std::equal(faces[n].v, faces[n].v+NfaceVerts,
+                  faces[n+1].v)){
+      // match
+      faces[n].elementN = faces[n+1].element;
+      faces[n].faceN = faces[n+1].face;
+
+      faces[n+1].elementN = faces[n].element;
+      faces[n+1].faceN = faces[n].face;
+      ++n; // skip the partner that was just paired
+    }
+  }
+
+  /* resort faces back to the original element/face ordering */
+  std::sort(faces.ptr(), faces.ptr()+Nelements*Nfaces,
+            [](const parallelFace_t& a, const parallelFace_t& b) {
+              if(a.element < b.element) return true;
+              if(a.element > b.element) return false;
+
+              return (a.face < b.face);
+            });
+
+  /* extract the element to element and element to face connectivity */
+
+  // count # of elements to send to each rank based on
+  // minimum {vertex id % gsize}
+  memory<int> Nsend(gsize, 0);
+  memory<int> Nrecv(gsize);
+  memory<int> sendOffsets(gsize);
+  memory<int> recvOffsets(gsize);
+
+  int allNsend=0;
+  for(dlong e=0;e<Nelements;++e){
+    for(int f=0;f<Nfaces;++f){
+      const dlong id = f+Nfaces*e; // valid again because faces was sorted back to (element, face) order
+      if (faces[id].elementN>-1) { /*matched face*/
+        elements[e].E[f] = faces[id].elementN; //global id
+        elements[e].F[f] = faces[id].faceN;
+      } else { /*unmatched*/
+        elements[e].E[f] = -1; //global id
+        elements[e].F[f] = -1; /*mark face*/
+
+        // find rank of destination for sorting based on min(face vertices)%gsize
+        int destRank = static_cast<int>(faces[id].v[0]%gsize); // both copies of a shared face land on the same rank
+
+        // increment the send count for destRank
+        ++Nsend[destRank];
+        ++allNsend;
+      }
+    }
+  }
+
+  // find send offsets
+  sendOffsets[0]=0;
+  for(int rr=1;rr<gsize;++rr)
+    sendOffsets[rr] = sendOffsets[rr-1] + Nsend[rr-1];
+
+  // reset counters (reused below as per-destination fill positions)
+  for(int rr=0;rr<gsize;++rr)
+    Nsend[rr] = 0;
+
+  // buffer for outgoing data
+  memory<parallelFace_t> sendFaces(allNsend);
+
+  // pack face data
+  for(dlong e=0;e<Nelements;++e){
+    for(int f=0;f<Nfaces;++f){
+      const dlong id = f+Nfaces*e;
+      if (faces[id].elementN==-1) { /*unmatched face*/
+
+        // find rank of destination for sorting based on min(face vertices)%gsize
+        int destRank = static_cast<int>(faces[id].v[0]%gsize);
+
+        // populate face to send out staged in segment of sendFaces array
+        const int sid = sendOffsets[destRank]+Nsend[destRank];
+        sendFaces[sid] = faces[id];
+        sendFaces[sid].rank = grank; // remember the origin so the receiver can sort the faces back
+        ++Nsend[destRank];
+      }
+    }
+  }
+  faces.free();
+
+  // exchange counts
+  gcomm.Alltoall(Nsend, Nrecv);
+
+  // count incoming faces
+  int allNrecv = 0;
+  for(int rr=0;rr<gsize;++rr)
+    allNrecv += Nrecv[rr];
+
+  // find offsets for recv data
+  recvOffsets[0]=0;
+  for(int rr=1;rr<gsize;++rr)
+    recvOffsets[rr] = recvOffsets[rr-1] + Nrecv[rr-1]; // offsets counted in faces (running sum of Nrecv), not bytes
+
+  // buffer for incoming face data
+  memory<parallelFace_t> recvFaces(allNrecv);
+
+  // exchange parallel faces
+  gcomm.Alltoallv(sendFaces, Nsend, sendOffsets,
+                  recvFaces, Nrecv, recvOffsets);
+
+  // local sort allNrecv received faces
+  std::sort(recvFaces.ptr(), recvFaces.ptr()+allNrecv,
+            [&](const parallelFace_t& a, const parallelFace_t& b) {
+              return std::lexicographical_compare(a.v, a.v+NfaceVerts,
+                                                  b.v, b.v+NfaceVerts);
+            });
+
+  // find matches
+  for(int n=0;n<allNrecv-1;++n){
+    // since vertices are ordered we just look for pairs
+    if(std::equal(recvFaces[n].v, recvFaces[n].v+NfaceVerts,
+                  recvFaces[n+1].v)){
+      recvFaces[n].elementN = recvFaces[n+1].element;
+      recvFaces[n].faceN = recvFaces[n+1].face;
+
+      recvFaces[n+1].elementN = recvFaces[n].element;
+      recvFaces[n+1].faceN = recvFaces[n].face;
+      ++n; // skip the partner that was just paired
+    }
+  }
+
+  // sort back to original ordering: by source rank, then (element, face), which is the order each sender packed in
+  std::sort(recvFaces.ptr(), recvFaces.ptr()+allNrecv,
+            [](const parallelFace_t& a, const parallelFace_t& b) {
+              if(a.rank < b.rank) return true;
+              if(a.rank > b.rank) return false;
+
+              if(a.element < b.element) return true;
+              if(a.element > b.element) return false;
+
+              return (a.face < b.face);
+            });
+
+  // send faces back from whence they came (same counts and offsets, with the send/recv roles swapped)
+  gcomm.Alltoallv(recvFaces, Nrecv, recvOffsets,
+                  sendFaces, Nsend, sendOffsets);
+
+  // extract connectivity info
+  for(int cnt=0;cnt<allNsend;++cnt){
+    dlong e = static_cast<dlong>(sendFaces[cnt].element-gVoffsetL); // back to the local element index
+    hlong eN = sendFaces[cnt].elementN;
+    int f = sendFaces[cnt].face;
+    int fN = sendFaces[cnt].faceN;
+
+    if(eN>=0 && fN>=0){ /*match found*/
+      elements[e].E[f] = eN;
+      elements[e].F[f] = fN;
+    }
+  }
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsCuthillMckee.cpp b/libs/parAdogs/parAdogsCuthillMckee.cpp
new file mode 100644
index 000000000..22e3cabf7
--- /dev/null
+++ b/libs/parAdogs/parAdogsCuthillMckee.cpp
@@ -0,0 +1,153 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include "parAdogs/parAdogsPartition.hpp"
+#include <queue>
+
+namespace libp {
+
+namespace paradogs {
+
+void graph_t::CuthillMckee() {
+  if (Nelements==0) return; /*empty rank: there is no seed element, and minloc=-1 below would index out of bounds and never terminate*/
+  /*Look for first node with lowest degree (degree counts only neighbors whose global id lies in [gVoffsetL,gVoffsetU), i.e. local ones)*/
+  int minDegree=Nfaces+1;
+  dlong minloc=-1;
+  for (dlong e=0;e<Nelements;++e) {
+    int degree=0;
+    for (int f=0;f<Nfaces;++f) {
+      const hlong eN = elements[e].E[f];
+      if ((eN!=-1) && (eN>=gVoffsetL) && (eN<gVoffsetU) ) {
+        degree++;
+      }
+    }
+    if (degree<minDegree) {
+      minDegree=degree;
+      minloc=e;
+    }
+  }
+
+  /*Create an ordering via Cuthill Mckee (breadth-first traversal of the local graph)*/
+  std::queue<dlong> q;
+
+  memory<hlong> newId(Nelements); //TODO halo region here
+
+  /*mark nodes as unvisited*/
+  memory<bool> visited(Nelements, false);
+
+  /*Start with minimal degree element*/
+  q.push(minloc);
+  visited[minloc] = true;
+
+  dlong cnt=0; /*number of elements numbered so far*/
+  do {
+    if (q.empty()) {
+      if (cnt==Nelements){
+        break; //Done
+      } else {
+        /*Disconnected? Restart from the unvisited node of lowest degree and keep growing*/
+        minDegree=Nfaces+1;
+        minloc=-1;
+        for (dlong e=0;e<Nelements;++e) {
+          if (visited[e]==true) continue;
+
+          int degree=0;
+          for (int f=0;f<Nfaces;++f) {
+            const hlong eN = elements[e].E[f];
+            if ((eN!=-1) && (eN>=gVoffsetL) && (eN<gVoffsetU) ) {
+              degree++;
+            }
+          }
+          if (degree<minDegree) {
+            minDegree=degree;
+            minloc=e;
+          }
+        }
+        /*an unvisited element must exist here: the queue is empty, so every visited element was numbered, and cnt<Nelements*/
+        q.push(minloc);
+        visited[minloc] = true;
+      }
+    }
+
+    const dlong e = q.front();
+    q.pop();
+
+    /*Give this node a new global index*/
+    newId[e] = gVoffsetL+cnt++;
+
+    /*Add all neighbors to the queue*/
+    for (int f=0;f<Nfaces;++f) {
+      const hlong eN = elements[e].E[f];
+      if ((eN!=-1) && (eN>=gVoffsetL) && (eN<gVoffsetU) ) {
+        const dlong eL = static_cast<dlong>(eN-gVoffsetL); //local id
+        if (visited[eL]==false) {
+          q.push(eL);
+          visited[eL]=true; /*mark on push so an element is queued at most once*/
+        }
+      }
+    }
+  } while(true);
+
+  /*we now have a new local ordering*/
+
+  /*Share the new ids*/
+  //TODO halo exchange here
+
+  /*Update connectivity (must happen before the permutation below, while E[f] still refers to the old numbering)*/
+  for(dlong e=0;e<Nelements;++e) {
+    for (int f=0;f<Nfaces;++f) {
+      const hlong eN = elements[e].E[f];
+      if (eN!=-1) {
+        if ((eN>=gVoffsetL) && (eN<gVoffsetU) ) {
+          dlong eL = static_cast<dlong>(eN-gVoffsetL);
+          elements[e].E[f] = newId[eL];
+        } else {
+          /*Need to think about how to update. Maybe it's easier to wrangle the graph for this? */
+        }
+      }
+    }
+  }
+
+  /*Permute local arrays to new ordering (in place, by following permutation cycles)*/
+  for(dlong e=0;e<Nelements;++e) {
+    //get what index element e should move to
+    dlong pe = static_cast<dlong>(newId[e]-gVoffsetL);
+    while (pe!=e) {
+      //swap: sends the element currently in slot e to its final slot pe, and brings slot pe's old content into slot e
+      std::swap(elements[e], elements[pe]);
+
+      std::swap(newId[e], newId[pe]);
+      pe = static_cast<dlong>(newId[e]-gVoffsetL);
+    }
+  }
+}
+
+} //namespace paradogs
+
+} //namespace libp
+
diff --git a/libs/parAdogs/parAdogsFiedlerVector.cpp b/libs/parAdogs/parAdogsFiedlerVector.cpp
new file mode 100644
index 000000000..99de2a028
--- /dev/null
+++ b/libs/parAdogs/parAdogsFiedlerVector.cpp
@@ -0,0 +1,176 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include <limits>
+
+extern "C" { // Fortran LAPACK symmetric eigensolver (double precision); every argument is passed by pointer
+  void dsyev_ (char *JOBZ, char *UPLO, int *N, double *A, int *LDA, double *W, double *WORK, int *LWORK, int *INFO);
+}
+
+namespace libp {
+
+namespace paradogs {
+
+/*Compute the Fiedler vector of the graph via the multilevel hierarchy; returns the level-0 vector*/
+memory<dfloat>& graph_t::FiedlerVector() {
+
+  /*Start from the Fiedler vector of the coarsest level*/
+  const int coarsest = Nlevels-1;
+  L[coarsest].FiedlerVector();
+
+  /*Walk from the second-coarsest level up to the finest (level 0)*/
+  for (int fine=coarsest-1; fine>=0; --fine) {
+    const int coarse = fine+1;
+    /*Prolongate the coarse Fiedler vector onto this level*/
+    L[fine].P.SpMV(1.0, L[coarse].Fiedler, 0.0, L[fine].Fiedler);
+    /*Refine the prolongated vector on this level*/
+    Refine(fine);
+  }
+  return L[0].Fiedler;
+}
+
+
+
+/*Compute Fiedler vector of graph Laplacian: gather A as a dense matrix on every rank, eigen-solve with LAPACK, keep the local rows*/
+void mgLevel_t::FiedlerVector() {
+
+  const int N = static_cast<int>(A.Nrows); // rows of A held by this rank
+
+  int size = A.comm.size();
+  memory<int> counts(size);
+  memory<int> offsets(size);
+
+  //collect partitioning info
+  A.comm.Allgather(N, counts);
+
+  int Ntotal=0; // global number of rows
+  for (int r=0;r<size;++r) {
+    Ntotal+=counts[r];
+  }
+  offsets[0]=0;
+  for (int r=1;r<size;++r) {
+    offsets[r]= offsets[r-1] + counts[r-1];
+  }
+
+  //populate local dense matrix (N rows by Ntotal columns, row-major)
+  memory<double> localA(N*Ntotal, 0.0);
+
+  /*Add sparse entries*/
+  #pragma omp parallel for
+  for (int n=0;n<N;n++) {
+    const int start = static_cast<int>(A.diag.rowStarts[n]);
+    const int end   = static_cast<int>(A.diag.rowStarts[n+1]);
+    for (int m=start;m<end;m++) {
+      const int col = static_cast<int>(A.diag.cols[m] + A.colOffsetL); // shift the diag-block column by colOffsetL; presumably yields the global column - confirm
+      localA[n*Ntotal+col] += A.diag.vals[m];
+    }
+  }
+  #pragma omp parallel for
+  for (int n=0;n<A.offd.nzRows;n++) {
+    const int row   = static_cast<int>(A.offd.rows[n]);
+    const int start = static_cast<int>(A.offd.mRowStarts[n]);
+    const int end   = static_cast<int>(A.offd.mRowStarts[n+1]);
+    for (int m=start;m<end;m++) {
+      const int col = static_cast<int>(A.colMap[A.offd.cols[m]]); // colMap presumably maps a local column to its global id - confirm
+      localA[row*Ntotal+col] += A.offd.vals[m];
+    }
+  }
+
+  //assemble the full matrix
+  memory<double> M(Ntotal*Ntotal);
+
+  for (int r=0;r<size;++r) { // convert row counts/offsets into entry counts/offsets (Ntotal entries per row)
+    counts[r] *= Ntotal;
+    offsets[r] *= Ntotal;
+  }
+
+  A.comm.Allgatherv(localA, N*Ntotal,
+                    M, counts, offsets);
+
+  localA.free();
+  counts.free();
+  offsets.free();
+
+  /*Call LaPack to find eigen pairs*/
+  int INFO = -999;
+  char JOBZ='V'; // compute eigenvectors as well as eigenvalues
+  char UPLO='L';
+  int LWORK = -1; // LWORK=-1 requests a workspace-size query
+  int LDA = Ntotal;
+  double WORKSIZE=0.0;
+  memory<double> W(Ntotal); // eigenvalues
+  dsyev_(&JOBZ, &UPLO, &Ntotal, M.ptr(), &LDA, W.ptr(), &WORKSIZE, &LWORK, &INFO); //Size query
+  // NOTE(review): INFO from the size query is overwritten by the next call and never inspected
+  LWORK = int(WORKSIZE);
+  double *WORK= new double[LWORK];
+  dsyev_(&JOBZ, &UPLO, &Ntotal, M.ptr(), &LDA, W.ptr(), WORK, &LWORK, &INFO); // on success M holds the eigenvectors (per LAPACK docs)
+  delete[] WORK;
+
+  LIBP_ABORT("Paradogs: dsyev_ reports info = " << INFO << " in FiedlerVector",
+             INFO); // presumably aborts when the second argument is nonzero - confirm the macro definition
+
+  /*Find the second smallest eigenvalue (the smallest is 0)*/
+  double min0 = std::numeric_limits<double>::max();
+  double min1 = std::numeric_limits<double>::max();
+  int minloc0 = -1;
+  int minloc1 = -1;
+  for (int i=0;i<Ntotal;++i) {
+    // printf("Eig[%d] = %f\n", i, W[i]);
+
+    if (W[i]<min0) { // new smallest: the previous smallest becomes the second smallest
+      min1 = min0;
+      min0 = W[i];
+      minloc1 = minloc0;
+      minloc0 = i;
+    } else if (W[i]<min1) {
+      min1 = W[i];
+      minloc1 = i;
+    }
+  }
+  // NOTE(review): minloc1 stays -1 when Ntotal<2, which would make the offset below negative - confirm callers guarantee Ntotal>=2
+  // printf("min1 = %f, minloc1 = %d \n", min1, minloc1);
+
+  memory<double> minV = M + minloc1*Ntotal; // eigenvector minloc1 occupies Ntotal consecutive entries of M
+  for (int i=0;i<N;++i) {
+    Fiedler[i] = minV[i+A.rowOffsetL]; // keep only the entries for this rank's rows
+  }
+
+  // Fiedler vector is already orthogonal to null
+
+  /* Fiedler vector is probably already normalized, but just in case */
+  dfloat norm = 0.0;
+  for (dlong n=0;n<N;++n) norm += Fiedler[n]*Fiedler[n];
+  A.comm.Allreduce(norm); // presumably a sum over all ranks - confirm the comm_t default reduction
+  norm = sqrt(norm);
+
+  for (dlong n=0;n<N;++n) Fiedler[n] /= norm;
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsGraph.cpp b/libs/parAdogs/parAdogsGraph.cpp
new file mode 100644
index 000000000..698b27da0
--- /dev/null
+++ b/libs/parAdogs/parAdogsGraph.cpp
@@ -0,0 +1,426 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+/*Build a graph from mesh connectivity info (one graph vertex per mesh element; face connectivity starts out unset)*/
+graph_t::graph_t(platform_t &_platform,
+                 const dlong _Nelements,           // local number of mesh elements
+                 const int _dim,                   // 2 selects the branch that skips EZ; any other value copies EZ too
+                 const int _Nverts,                // vertices per element
+                 const int _Nfaces,                // faces per element
+                 const int _NfaceVerts,            // vertices per face
+                 const memory<int>& faceVertices,  // Nfaces*NfaceVerts entries, copied into faceVerts
+                 const memory<hlong>& EToV,        // vertex ids, indexed as v+e*NelementVerts
+                 const memory<dfloat>& EX,         // vertex x coordinates, same indexing
+                 const memory<dfloat>& EY,         // vertex y coordinates, same indexing
+                 const memory<dfloat>& EZ,         // vertex z coordinates, same indexing; not read when dim==2
+                 comm_t _comm):
+  platform(_platform),
+  Nverts(_Nelements),      // graph vertices are the mesh elements
+  Nelements(_Nelements),
+  dim(_dim),
+  Nfaces(_Nfaces),
+  NelementVerts(_Nverts),  // note: _Nverts is the per-element vertex count, not the graph vertex count
+  NfaceVerts(_NfaceVerts) {
+
+  gcomm = _comm.Dup(); // two separate duplicates of _comm are kept: gcomm here ...
+  grank = gcomm.rank();
+  gsize = gcomm.size();
+
+  comm  = _comm.Dup(); // ... and comm
+  rank = comm.rank();
+  size = comm.size();
+
+  for (int n=0;n<Nfaces*NfaceVerts;++n) // assumes faceVerts (declared elsewhere) has room for Nfaces*NfaceVerts entries - TODO confirm
+    faceVerts[n] = faceVertices[n];
+
+  /*Global number of elements*/
+  NVertsGlobal=static_cast<hlong>(Nverts);
+  comm.Allreduce(NVertsGlobal); // presumably a sum over comm - confirm the comm_t default reduction
+
+  /*Get global element count offsets*/
+  hlong localNverts=static_cast<hlong>(Nverts);
+  comm.Scan(localNverts, VoffsetU); // presumably an inclusive prefix sum, making VoffsetU one past this rank's last global id - confirm
+  VoffsetL = VoffsetU-Nverts;
+
+  gNVertsGlobal = NVertsGlobal; // the g* copies start out identical to the comm-based values
+  gVoffsetL = VoffsetL;
+  gVoffsetU = VoffsetU;
+
+  /*Create array of packed element data*/
+  elements.malloc(Nelements);
+
+  if (dim==2) {
+    for (dlong e=0;e<Nelements;++e) {
+      for (int v=0;v<NelementVerts;++v) {
+        elements[e].EX[v] = EX[v+e*NelementVerts];
+        elements[e].EY[v] = EY[v+e*NelementVerts];
+
+        elements[e].V[v] = EToV[v+e*NelementVerts];
+      }
+      for (int f=0;f<Nfaces;++f) {
+        elements[e].E[f] = -1; // -1 = no neighbor recorded yet
+        elements[e].F[f] = -1;
+      }
+    }
+  } else {
+    for (dlong e=0;e<Nelements;++e) {
+      for (int v=0;v<NelementVerts;++v) {
+        elements[e].EX[v] = EX[v+e*NelementVerts];
+        elements[e].EY[v] = EY[v+e*NelementVerts];
+        elements[e].EZ[v] = EZ[v+e*NelementVerts];
+
+        elements[e].V[v] = EToV[v+e*NelementVerts];
+      }
+      for (int f=0;f<Nfaces;++f) {
+        elements[e].E[f] = -1; // -1 = no neighbor recorded yet
+        elements[e].F[f] = -1;
+      }
+    }
+  }
+}
+
+/*Globally divide graph into two pieces according to a bipartition: redistribute elements, then split comm in two*/
+void graph_t::Split(const memory<int>& partition) { // partition[n]==0 puts vertex n in part 0; any other value puts it in part 1
+
+  /*Count how much of each partition we have locally*/
+  dlong Nverts0=0;
+  dlong Nverts1=0;
+  for (dlong n=0;n<Nverts;++n) {
+    if (partition[n]==0) Nverts0++;
+    else                 Nverts1++;
+  }
+
+  hlong globalNverts0=static_cast<hlong>(Nverts0);
+  hlong globalNverts1=static_cast<hlong>(Nverts1);
+  comm.Allreduce(globalNverts0); // presumably a sum over comm - confirm the comm_t default reduction
+  comm.Allreduce(globalNverts1);
+
+  /*Get offsets of partitions on each rank*/
+  memory<hlong> starts0(size+1);
+  memory<hlong> starts1(size+1);
+  starts0[0]=0;
+  starts1[0]=0;
+  hlong localNverts0 = static_cast<hlong>(Nverts0);
+  hlong localNverts1 = static_cast<hlong>(Nverts1);
+  comm.Allgather(localNverts0, starts0+1); // per-rank counts land in entries 1..size
+  comm.Allgather(localNverts1, starts1+1);
+
+  for(int r=0;r<size;++r) { // running sum: starts*[r] becomes the new global index of rank r's first vertex in that part
+    starts0[r+1] += starts0[r];
+    starts1[r+1] += starts1[r];
+  }
+
+  /*Determine number of ranks to hold left and right partitions*/
+  const int size0 = (size+1)/2; // part 0 goes to ranks [0,size0)
+  const int size1 = size-size0; // part 1 goes to ranks [size0,size)
+  // NOTE(review): size1 is 0 when size==1, making chunk1 below a division by zero - presumably Split is only called with size>1; confirm
+  const hlong chunk0 = globalNverts0/size0;
+  const hlong chunk1 = globalNverts1/size1;
+
+  const int remainder0 = static_cast<int>(globalNverts0 - chunk0*size0);
+  const int remainder1 = static_cast<int>(globalNverts1 - chunk1*size1);
+
+  memory<int> Nsend0(size,0);
+  memory<int> Nsend1(size,0);
+  memory<int> Nrecv0(size);
+  memory<int> Nrecv1(size);
+  memory<int> sendOffsets0(size);
+  memory<int> sendOffsets1(size);
+  memory<int> recvOffsets0(size);
+  memory<int> recvOffsets1(size);
+
+  memory<hlong> newIds(Nverts+Nhalo);
+
+  /*Determine new ids and send counts (each part is numbered from 0 independently)*/
+  dlong cnt0=0;
+  dlong cnt1=0;
+  for(dlong e=0;e<Nverts;++e){
+    if (partition[e]==0) {
+      // new global element index
+      const hlong ep = starts0[rank]+cnt0++;
+      newIds[e] = ep;
+
+      // 0, chunk+1, 2*(chunk+1) ..., remainder*(chunk+1), remainder*(chunk+1) + chunk
+      int r; // destination within part 0: the first remainder0 ranks take chunk0+1 vertices, the rest take chunk0
+      if(ep<remainder0*(chunk0+1))
+        r = ep/(chunk0+1);
+      else
+        r = remainder0 + ((ep-remainder0*(chunk0+1))/chunk0);
+
+      ++Nsend0[r];
+    } else {
+      // new global element index
+      const hlong ep = starts1[rank]+cnt1++;
+      newIds[e] = ep;
+
+      // 0, chunk+1, 2*(chunk+1) ..., remainder*(chunk+1), remainder*(chunk+1) + chunk
+      int r; // destination within part 1, before shifting by size0
+      if(ep<remainder1*(chunk1+1))
+        r = ep/(chunk1+1);
+      else
+        r = remainder1 + ((ep-remainder1*(chunk1+1))/chunk1);
+
+      ++Nsend1[r+size0]; // part-1 destinations are the ranks after the size0 part-0 ranks
+    }
+  }
+
+  starts0.free();
+  starts1.free();
+
+  if (L[0].Nglobal) {
+    /*If we have connected the elements, share the newIds*/
+    L[0].A.halo.Exchange(newIds, 1);
+
+    /*Then update the connectivity*/
+    dlong cnt=0; // walks colIds in the same (element, face) order in which halo neighbors are met here
+    for(dlong e=0;e<Nverts;++e){
+      const int part = partition[e];
+      for (int f=0;f<Nfaces;++f) {
+        const hlong gE = elements[e].E[f];
+        if (gE!=-1) {
+          dlong eN;
+          if (gE>=VoffsetL && gE<VoffsetU) { /*local neighbor*/
+            eN = static_cast<dlong>(gE-VoffsetL);
+          } else { /*halo neighbor*/
+            eN = colIds[cnt++]; /*Get the local id in the halo (we make this when building the Laplacian)*/
+          }
+
+          const int partN = partition[eN]; // assumes partition also holds entries for halo ids - TODO confirm against caller
+          if (partN==part) { /*If both elements are in the same partition*/
+            elements[e].E[f] = newIds[eN]; /*Re index*/
+          } else {
+            elements[e].E[f] = -1;/*else break connections across the partitions*/
+          }
+        }
+      }
+    }
+  }
+  newIds.free();
+
+  // find send offsets
+  sendOffsets0[0]=0;
+  sendOffsets1[0]=0;
+  for(int r=1;r<size;++r) {
+    sendOffsets0[r] = sendOffsets0[r-1] + Nsend0[r-1];
+    sendOffsets1[r] = sendOffsets1[r-1] + Nsend1[r-1];
+  }
+  int NsendTotal0=0;
+  int NsendTotal1=0;
+  for(int r=0;r<size;++r) {
+    NsendTotal0 += Nsend0[r];
+    NsendTotal1 += Nsend1[r];
+  }
+
+  // exchange counts
+  comm.Alltoall(Nsend0, Nrecv0);
+  comm.Alltoall(Nsend1, Nrecv1);
+
+  // find recv offsets
+  recvOffsets0[0]=0;
+  recvOffsets1[0]=0;
+  for(int r=1;r<size;++r) {
+    recvOffsets0[r] = recvOffsets0[r-1] + Nrecv0[r-1];
+    recvOffsets1[r] = recvOffsets1[r-1] + Nrecv1[r-1];
+  }
+
+  // count incoming clusters
+  dlong newNverts = 0;
+
+  if (rank<size0) { // this rank will hold part 0
+    for(int r=0;r<size;++r) {
+      newNverts += Nrecv0[r];
+    }
+  } else { // this rank will hold part 1
+    for(int r=0;r<size;++r) {
+      newNverts += Nrecv1[r];
+    }
+  }
+
+  /*make send buffers*/
+  memory<element_t> sendElements0(NsendTotal0);
+  memory<element_t> sendElements1(NsendTotal1);
+  // packing in local order is already grouped by destination: new ids grow with e, and the destination rank grows with the new id
+  cnt0=0;
+  cnt1=0;
+  for(dlong e=0;e<Nverts;++e){
+    if (partition[e]==0) {
+      sendElements0[cnt0++] = elements[e];
+    } else {
+      sendElements1[cnt1++] = elements[e];
+    }
+  }
+
+  /*make new list*/
+  Nverts = newNverts;
+  Nelements = newNverts;
+  elements.malloc(Nverts); // the old contents were copied into the send buffers above
+
+  memory<element_t> null; // empty receive buffer for the part this rank does not hold (its recv counts are all zero)
+
+  // exchange elements
+  if (rank<size0) {
+    comm.Alltoallv(sendElements0, Nsend0, sendOffsets0,
+                        elements, Nrecv0, recvOffsets0);
+    comm.Alltoallv(sendElements1, Nsend1, sendOffsets1,
+                            null, Nrecv1, recvOffsets1);
+  } else {
+    comm.Alltoallv(sendElements0, Nsend0, sendOffsets0,
+                            null, Nrecv0, recvOffsets0);
+    comm.Alltoallv(sendElements1, Nsend1, sendOffsets1,
+                        elements, Nrecv1, recvOffsets1);
+  }
+
+  comm_t newComm = comm.Split(rank<size0, rank); // presumably (color, key) as in MPI_Comm_split: one communicator per part, old rank order kept - confirm
+  comm.Free();
+  comm = newComm;
+
+  rank = comm.rank();
+  size = comm.size();
+
+  /*Global number of elements*/
+  NVertsGlobal=static_cast<hlong>(Nverts);
+  comm.Allreduce(NVertsGlobal);
+
+  /*Get global element count offsets*/
+  hlong localNverts=static_cast<hlong>(Nverts);
+  comm.Scan(localNverts, VoffsetU);
+  VoffsetL = VoffsetU-Nverts;
+}
+
+void graph_t::Report() {
+  /* Print a partition-quality table from global rank 0: element counts and */
+  /* halo (cut) face counts, each as a global total plus per-rank min/avg/max. */
+  hlong totalVerts = static_cast<hlong>(Nverts);
+  gcomm.Allreduce(totalVerts);
+  const dfloat meanVerts = static_cast<dfloat>(totalVerts)/gsize;
+
+  dlong fewestVerts = Nverts;
+  dlong mostVerts = Nverts;
+  gcomm.Allreduce(fewestVerts, Comm::Min);
+  gcomm.Allreduce(mostVerts, Comm::Max);
+
+  /* A face counts as cut when its neighbor id lies outside [gVoffsetL, gVoffsetU) */
+  dlong localCut = 0;
+  for (dlong e=0;e<Nverts;++e) {
+    for (int f=0;f<Nfaces;++f) {
+      const hlong nbr = elements[e].E[f];
+      if (nbr==-1) continue; /* face has no neighbor */
+
+      const bool offRank = (nbr<gVoffsetL) || (nbr>=gVoffsetU);
+      if (offRank) {
+        ++localCut;
+      }
+    }
+  }
+
+  hlong totalCut = static_cast<hlong>(localCut);
+  gcomm.Allreduce(totalCut);
+  const hlong meanCut = totalCut/gsize;
+
+  dlong fewestCut = localCut;
+  dlong mostCut = localCut;
+  gcomm.Allreduce(fewestCut, Comm::Min);
+  gcomm.Allreduce(mostCut, Comm::Max);
+
+  if (grank!=0) return; /* every rank joins the reductions, only rank 0 prints */
+  printf("--------------------------------------ParAdogs Report------------------------------------------\n");
+  printf("-----------------------------------------------------------------------------------------------\n");
+  printf("   Nranks   |    Elements   |   Per Rank Elements   |   Halo Faces   |   Per Rank Halo Faces  |\n");
+  printf("            |               |       (min,avg,max)   |                |         (min,avg,max)  |\n");
+  printf("-----------------------------------------------------------------------------------------------\n");
+  printf("%9d   | %11lld   |       %13lld   | %12lld   |         %13lld  |\n",
+         gsize,
+         static_cast<long long int>(totalVerts),
+         static_cast<long long int>(fewestVerts),
+         static_cast<long long int>(totalCut),
+         static_cast<long long int>(fewestCut));
+  printf("            |               |       %13lld   |                |         %13lld  |\n",
+         static_cast<long long int>(meanVerts),
+         static_cast<long long int>(meanCut));
+  printf("            |               |       %13lld   |                |         %13lld  |\n",
+         static_cast<long long int>(mostVerts),
+         static_cast<long long int>(mostCut));
+  printf("-----------------------------------------------------------------------------------------------\n");
+}
+
+void graph_t::ExtractMesh(dlong &Nelements_,
+                          memory<hlong>& EToV,
+                          memory<hlong>& EToE,
+                          memory<int>& EToF,
+                          memory<dfloat>& EX,
+                          memory<dfloat>& EY,
+                          memory<dfloat>& EZ) {
+  /* Replace the caller's mesh arrays with the contents of the current graph.
+   *   Nelements_   : receives the local element count
+   *   EToV, EX, EY : vertex ids and coordinates, NelementVerts per element
+   *   EZ           : vertex z coordinates; only allocated when dim==3
+   *   EToE, EToF   : per-face connectivity copied from E and F, Nfaces per element
+   * Any existing mesh data held by these arrays is dropped by the mallocs.
+   */
+  Nelements_ = Nelements;
+
+  /* Vertex-based arrays */
+  EToV.malloc(Nelements*NelementVerts);
+  EX.malloc(Nelements*NelementVerts);
+  EY.malloc(Nelements*NelementVerts);
+  if (dim==3)
+    EZ.malloc(Nelements*NelementVerts);
+
+  /* Face-based arrays. These are indexed with stride Nfaces below, so size
+     them with Nfaces rather than NelementVerts: the two counts need not
+     agree, and using the vertex count would under-allocate whenever an
+     element has more faces than vertices. */
+  EToE.malloc(Nelements*Nfaces);
+  EToF.malloc(Nelements*Nfaces);
+
+  const bool hasZ = (dim!=2); /* same test the separate 2D/3D copy loops used */
+  for (dlong e=0;e<Nelements;++e) {
+    const element_t& elem = elements[e];
+    for (int v=0;v<NelementVerts;++v) {
+      const dlong id = v+e*NelementVerts;
+      EToV[id] = elem.V[v];
+      EX[id] = elem.EX[v];
+      EY[id] = elem.EY[v];
+      if (hasZ) EZ[id] = elem.EZ[v];
+    }
+    for (int f=0;f<Nfaces;++f) {
+      const dlong id = f+e*Nfaces;
+      EToE[id] = elem.E[f];
+      EToF[id] = elem.F[f];
+    }
+  }
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsInertialBipartition.cpp b/libs/parAdogs/parAdogsInertialBipartition.cpp
new file mode 100644
index 000000000..5a945bacf
--- /dev/null
+++ b/libs/parAdogs/parAdogsInertialBipartition.cpp
@@ -0,0 +1,201 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include "parAdogs/parAdogsPartition.hpp"
+
+extern "C" { // LAPACK DSYEV (Fortran symbol): eigenvalues and, optionally, eigenvectors of a real symmetric matrix
+  void dsyev_ (char *JOBZ, char *UPLO, int *N, double *A, int *LDA, double *W, double *WORK, int *LWORK, int *INFO);
+}
+
+namespace libp {
+
+namespace paradogs {
+
+/****************************************/
+/* Serial Inertial Bipartition          */
+/****************************************/
+void graph_t::InertialBipartition(const dfloat targetFraction[2]) {
+  /* Bisect the graph along the principal inertial axis of the element centroids, then redistribute it with Split(). Only targetFraction[0] is read. */
+  memory<int> partition(Nverts);
+  /* Second-moment matrix of the centroids; the accumulation loops below rely on calloc zero-filling it */
+  memory<double> I;
+  I.calloc(9);
+
+  memory<dfloat> x, y, z;
+
+  if (dim==2) {
+    x.malloc(Nverts);
+    y.malloc(Nverts);
+
+    /*Centroid of each element: the mean of its vertex coordinates*/
+    for (dlong e=0;e<Nverts;++e) {
+      x[e]=0.0;
+      y[e]=0.0;
+      for (int v=0;v<NelementVerts;++v) {
+        x[e] += elements[e].EX[v];
+        y[e] += elements[e].EY[v];
+      }
+      x[e] /= NelementVerts;
+      y[e] /= NelementVerts;
+    }
+
+    /*Mean of the centroids over the whole distributed graph*/
+    memory<double> avg(2);
+
+    avg[0]=0.0;
+    avg[1]=0.0;
+    for (dlong e=0;e<Nverts;++e) {
+      avg[0] += x[e];
+      avg[1] += y[e];
+    }
+    comm.Allreduce(avg); // combine the per-rank partial sums (default reduction op, presumably a sum)
+
+    avg[0] /= NVertsGlobal;
+    avg[1] /= NVertsGlobal;
+    /*Accumulate second moments about the mean into the 2x2 matrix held in I[0..3]*/
+    for (dlong e=0;e<Nverts;++e) {
+      const dfloat X = x[e] - avg[0];
+      const dfloat Y = y[e] - avg[1];
+
+      I[0] += X*X; I[1] += X*Y;
+      I[2] += Y*X; I[3] += Y*Y;
+    }
+    comm.Allreduce(I);
+
+  } else {
+    x.malloc(Nverts);
+    y.malloc(Nverts);
+    z.malloc(Nverts);
+
+    /*Centroid of each element: the mean of its vertex coordinates*/
+    for (dlong e=0;e<Nverts;++e) {
+      x[e]=0.0;
+      y[e]=0.0;
+      z[e]=0.0;
+      for (int v=0;v<NelementVerts;++v) {
+        x[e] += elements[e].EX[v];
+        y[e] += elements[e].EY[v];
+        z[e] += elements[e].EZ[v];
+      }
+      x[e] /= NelementVerts;
+      y[e] /= NelementVerts;
+      z[e] /= NelementVerts;
+    }
+
+    /*Mean of the centroids over the whole distributed graph*/
+    memory<double> avg(3);
+
+    avg[0]=0.0;
+    avg[1]=0.0;
+    avg[2]=0.0;
+    for (dlong e=0;e<Nverts;++e) {
+      avg[0] += x[e];
+      avg[1] += y[e];
+      avg[2] += z[e];
+    }
+    comm.Allreduce(avg); // combine the per-rank partial sums (default reduction op, presumably a sum)
+
+    avg[0] /= NVertsGlobal;
+    avg[1] /= NVertsGlobal;
+    avg[2] /= NVertsGlobal;
+    /*Accumulate second moments about the mean into the 3x3 matrix held in I[0..8]*/
+    for (dlong e=0;e<Nverts;++e) {
+      const dfloat X = x[e] - avg[0];
+      const dfloat Y = y[e] - avg[1];
+      const dfloat Z = z[e] - avg[2];
+
+      I[0] += X*X; I[1] += X*Y; I[2] += X*Z;
+      I[3] += Y*X; I[4] += Y*Y; I[5] += Y*Z;
+      I[6] += Z*X; I[7] += Z*Y; I[8] += Z*Z;
+    }
+    comm.Allreduce(I);
+  }
+
+  /*Find the principal axis of inertia: eigen-decompose the dim x dim matrix with LAPACK*/
+  int N = dim;
+  int INFO = -999;
+  char JOBZ='V'; // compute eigenvectors as well as eigenvalues
+  char UPLO='L'; // use the lower triangle (the matrix is filled symmetrically)
+  int LDA = N;
+  double W[3];
+  int LWORK = 8; // DSYEV requires LWORK >= 3*N-1, which is 8 for N=3
+  double WORK[8];
+  dsyev_(&JOBZ, &UPLO, &N, I.ptr(), &LDA, W, WORK, &LWORK, &INFO);
+  // With JOBZ='V' a successful DSYEV overwrites I with the eigenvectors and fills W with the eigenvalues
+  LIBP_ABORT("Paradogs: dsyev_ reports info = " << INFO << " in InertialBipartition",
+             INFO);
+
+  /*Find the largest eigenvalue*/
+  double max = W[0];
+  int maxloc = 0;
+  for (int i=1;i<dim;++i) {
+    if (W[i]>max) {
+      max = W[i];
+      maxloc = i;
+    }
+  }
+  // printf("max = %f, maxloc = %d \n", max, maxloc);
+
+  /*Principal axis is the eigenvector with largest eigenvalue; it is read from the N entries starting at I[maxloc*N]*/
+  double a[3];
+  memory<double> maxV = I + maxloc*N;
+  for (int i=0;i<N;++i) {
+    a[i] = maxV[i];
+  }
+
+  /*Use principal axis to bipartition graph: F[e] is the projection of centroid e onto the axis*/
+  memory<dfloat> F(Nverts);
+
+  if (dim==2) {
+    for (dlong e=0;e<Nverts;++e) {
+      F[e] = x[e]*a[0] + y[e]*a[1];
+    }
+  } else {
+    for (dlong e=0;e<Nverts;++e) {
+      F[e] = x[e]*a[0] + y[e]*a[1] + z[e]*a[2];
+    }
+  }
+  /*K is the vertex count targeted for partition 0; ParallelPivot presumably returns the K-th smallest F over all ranks - TODO confirm*/
+  const hlong K = std::ceil(targetFraction[0]*NVertsGlobal);
+  const dfloat pivot = ParallelPivot(Nverts, F, K, comm);
+  /*Vertices at or below the pivot go to partition 0, the rest to partition 1*/
+  for (dlong n=0;n<Nverts;++n) {
+    if (F[n]<=pivot) {
+      partition[n] = 0;
+    } else {
+      partition[n] = 1;
+    }
+  }
+
+  /*Split the graph according to this partitioning*/
+  Split(partition);
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsInertialPartition.cpp b/libs/parAdogs/parAdogsInertialPartition.cpp
new file mode 100644
index 000000000..226586df6
--- /dev/null
+++ b/libs/parAdogs/parAdogsInertialPartition.cpp
@@ -0,0 +1,60 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include "parAdogs/parAdogsPartition.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+/*************************************************/
+/* k-Way Recursive Inertial Partitioning         */
+/*************************************************/
+void graph_t::InertialPartition() {
+  /* Bisect repeatedly until this rank's communicator is down to one rank. */
+  /* Every pass goes through InertialBipartition(), which hands the graph  */
+  /* to Split(); that is where comm and size are replaced by the new half. */
+  while (size!=1) {
+    /*Ranks destined for the first half (the larger one when size is odd)*/
+    const int leftRanks = (size+1)/2;
+
+    /*Share of the vertices each half should end up with*/
+    dfloat fractions[2] = {0.0, 0.0};
+    fractions[0] = static_cast<dfloat>(leftRanks)/size;
+    fractions[1] = 1.0 - fractions[0];
+
+    /*Bipartition and redistribute. Only fractions[0] is consumed at the*/
+    /*moment, but both entries are filled in to honor the interface.*/
+    InertialBipartition(fractions);
+  }
+  /*size==1 here: nothing left to split on this rank*/
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMatrix.cpp b/libs/parAdogs/parAdogsMatrix.cpp
new file mode 100644
index 000000000..cd6b4f543
--- /dev/null
+++ b/libs/parAdogs/parAdogsMatrix.cpp
@@ -0,0 +1,416 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsMatrix.hpp"
+#include <random>
+#include <algorithm>
+
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::sort;
+#else
+using std::sort;
+#endif
+
+namespace libp {
+
+namespace paradogs {
+
+std::mt19937 RNG; // generator shared by the parAdogs sources (declared extern elsewhere); default-seeded here, reseeded with the rank in MeshPartition
+
+//------------------------------------------------------------------------
+//
+//  parCSR matrix
+//
+//------------------------------------------------------------------------
+
+void parCSR::SpMV(const dfloat alpha, memory<dfloat>& x,
+                  const dfloat beta, memory<dfloat>& y) {
+  // In-place product y = beta*y + alpha*A*x; x is indexed by local and halo column ids
+  const bool keepY = (beta!=0.0); // when false, y is overwritten without being read
+
+  halo.ExchangeStart(x, 1);
+
+  // Diagonal block (local columns), computed between ExchangeStart and ExchangeFinish
+  #pragma omp parallel for
+  for(dlong row=0; row<Nrows; row++){
+    const dlong first = diag.rowStarts[row];
+    const dlong last  = diag.rowStarts[row+1];
+    dfloat Ax = 0.0;
+    for(dlong k=first; k<last; k++)
+      Ax += diag.vals[k]*x[diag.cols[k]];
+
+    y[row] = keepY ? (alpha*Ax + beta*y[row]) : (alpha*Ax);
+  }
+
+  halo.ExchangeFinish(x, 1);
+
+  // Off-diagonal block (halo columns): only rows holding such entries are stored
+  #pragma omp parallel for
+  for(dlong n=0; n<offd.nzRows; n++){
+    dfloat Ax = 0.0;
+    for(dlong k=offd.mRowStarts[n]; k<offd.mRowStarts[n+1]; k++)
+      Ax += offd.vals[k]*x[offd.cols[k]];
+    y[offd.rows[n]] += alpha*Ax;
+  }
+}
+
+void parCSR::SpMV(const dfloat alpha, memory<dfloat>& x,
+                  const dfloat beta, const memory<dfloat>& y, memory<dfloat>& z) {
+  // Out-of-place product z = beta*y + alpha*A*x; x is indexed by local and halo column ids
+  halo.ExchangeStart(x, 1);
+
+  // Diagonal block: z[i] = beta*y[i] + alpha*(sum_j Aij*x[j]) over the local columns
+  #pragma omp parallel for
+  for(dlong i=0; i<Nrows; i++){
+    dfloat result = 0.0;
+    for(dlong jj=diag.rowStarts[i]; jj<diag.rowStarts[i+1]; jj++)
+      result += diag.vals[jj]*x[diag.cols[jj]];
+    // As in the two-vector overload, never read y when beta==0 (0*NaN would poison z)
+    z[i] = (beta!=0.0) ? (alpha*result + beta*y[i]) : (alpha*result);
+  }
+
+  halo.ExchangeFinish(x, 1);
+
+  // Off-diagonal block: add the halo-column contributions to the rows that have them
+  #pragma omp parallel for
+  for(dlong i=0; i<offd.nzRows; i++){
+    const dlong row = offd.rows[i];
+    dfloat result = 0.0;
+    for(dlong jj=offd.mRowStarts[i]; jj<offd.mRowStarts[i+1]; jj++)
+      result += offd.vals[jj]*x[offd.cols[jj]];
+    z[row] += alpha*result;
+  }
+}
+
+//------------------------------------------------------------------------
+//
+//  parCSR matrix setup
+//
+//------------------------------------------------------------------------
+
+//build a parCSR matrix from a distributed COO matrix
+parCSR::parCSR(dlong _Nrows, dlong _Ncols,
+               const dlong NNZ,
+               memory<nonZero_t>& entries,
+               const platform_t &_platform,
+               comm_t _comm):
+  platform(_platform),
+  comm(_comm) {
+  // Split the NNZ COO entries into a "diag" CSR block (columns owned by this rank) and an "offd" block (all other columns)
+  Nrows = _Nrows;
+  Ncols = _Ncols;
+
+  /*Global offsets: this rank's rows are [rowOffsetL,rowOffsetU) and its columns [colOffsetL,colOffsetU) (assumes Scan is an inclusive prefix sum - TODO confirm)*/
+  hlong localNrows = static_cast<hlong>(Nrows);
+  hlong localNcols = static_cast<hlong>(Ncols);
+  comm.Scan(localNrows, rowOffsetU);
+  comm.Scan(localNcols, colOffsetU);
+  rowOffsetL = rowOffsetU-Nrows;
+  colOffsetL = colOffsetU-Ncols;
+
+  diag.rowStarts.malloc(Nrows+1);
+  offd.rowStarts.malloc(Nrows+1);
+
+  #pragma omp parallel for
+  for (dlong n=0;n<Nrows+1;n++) {
+    diag.rowStarts[n]=0;
+    offd.rowStarts[n]=0;
+  }
+
+  // count the entries in each row, stored one slot up so the running sums below turn the counts into start offsets
+  for (dlong n=0;n<NNZ;n++) {
+    const dlong row = static_cast<dlong>(entries[n].row-rowOffsetL);
+    if (   (entries[n].col <  colOffsetL)
+        || (entries[n].col >= colOffsetU)) {
+      offd.rowStarts[row+1]++;
+    } else {
+      diag.rowStarts[row+1]++;
+    }
+  }
+
+  // count how many rows have at least one off-rank column
+  offd.nzRows=0;
+  for(dlong i=0; i<Nrows; i++)
+    if (offd.rowStarts[i+1]>0) offd.nzRows++;
+
+  offd.rows.malloc(offd.nzRows);
+  offd.mRowStarts.malloc(offd.nzRows+1);
+
+  // running sums: counts become CSR offsets; offd also gets a compressed list (rows/mRowStarts) of its nonempty rows
+  dlong cnt=0;
+  offd.mRowStarts[0]=0;
+  for(dlong i=0; i<Nrows; i++) {
+    if (offd.rowStarts[i+1]>0) { // still a per-row count here; it is only accumulated two statements below
+      offd.rows[cnt] = i; //record row id
+      offd.mRowStarts[cnt+1] = offd.mRowStarts[cnt] + offd.rowStarts[i+1];
+      cnt++;
+    }
+    diag.rowStarts[i+1] += diag.rowStarts[i];
+    offd.rowStarts[i+1] += offd.rowStarts[i];
+  }
+  diag.nnz = diag.rowStarts[Nrows];
+  offd.nnz = offd.rowStarts[Nrows];
+
+  // Halo setup: gather the global ids of the off-rank columns, in entry order
+  cnt=0;
+  memory<hlong> colIds(offd.nnz);
+  for (dlong n=0;n<NNZ;n++) {
+    if (   (entries[n].col <  colOffsetL)
+        || (entries[n].col >= colOffsetU)) {
+      colIds[cnt++] = entries[n].col;
+    }
+  }
+  haloSetup(colIds); //setup halo, and transform colIds to a local indexing
+
+  // fill the CSR matrices. NOTE(review): values are appended in entry order, which appears to assume entries arrive sorted by row - confirm with callers
+  diag.cols.malloc(diag.nnz);
+  offd.cols.malloc(offd.nnz);
+  diag.vals.malloc(diag.nnz);
+  offd.vals.malloc(offd.nnz);
+  dlong diagCnt = 0;
+  dlong offdCnt = 0;
+  for (dlong n=0;n<NNZ;n++) {
+    if (   (entries[n].col <  colOffsetL)
+        || (entries[n].col >= colOffsetU)) {
+      offd.cols[offdCnt] = colIds[offdCnt]; // colIds was filled in this same entry order, so the counters line up
+      offd.vals[offdCnt] = entries[n].val;
+      offdCnt++;
+    } else {
+      diag.cols[diagCnt] = static_cast<dlong>(entries[n].col-colOffsetL);
+      diag.vals[diagCnt] = entries[n].val;
+      diagCnt++;
+    }
+  }
+}
+
+//------------------------------------------------------------------------
+//
+//  parCSR halo setup
+//
+//------------------------------------------------------------------------
+
+// Bookkeeping record haloSetup uses to renumber the nonlocal columns
+struct parallelId_t {
+  dlong localId;   // position of the entry in the offd column list
+  hlong globalId;  // global column id of the entry
+
+  dlong newId;     // index of that column among the unique nonlocal columns
+
+};
+
+
+void parCSR::haloSetup(memory<hlong>& colIds) {
+  // Build the halo exchange for the off-rank columns and rewrite colIds (global ids, one per offd entry) as local column indices
+  //collect the unique nonlocal column ids
+  memory<parallelId_t> parIds(offd.nnz);
+
+  for (dlong n=0;n<offd.nnz;n++) {
+    parIds[n].localId  = n;
+    parIds[n].globalId = colIds[n];
+  }
+
+  //sort by global index (ties broken by original position)
+  sort(parIds.ptr(), parIds.ptr()+offd.nnz,
+       [](const parallelId_t& a, const parallelId_t& b) {
+         if(a.globalId < b.globalId) return true;
+         if(a.globalId > b.globalId) return false;
+
+         return (a.localId < b.localId);
+       });
+
+  //count unique nonlocal column ids, giving each entry the rank of its column among them
+  dlong Noffdcols = 0; //number of unique columns
+  if(offd.nnz) parIds[0].newId = Noffdcols;
+  for (dlong n=1;n<offd.nnz;n++) {
+    if (parIds[n].globalId != parIds[n-1].globalId)
+      Noffdcols++;
+
+    parIds[n].newId = Noffdcols;
+  }
+  if(offd.nnz) Noffdcols++;
+
+  //record the global ids of the unique columns
+  memory<hlong> offdcols(Noffdcols);
+  Noffdcols = 0;
+  if(offd.nnz) offdcols[Noffdcols++] = parIds[0].globalId;
+  for (dlong n=1;n<offd.nnz;n++)
+    if (parIds[n].globalId != parIds[n-1].globalId)
+      offdcols[Noffdcols++] = parIds[n].globalId;
+
+  //sort back to local order
+  sort(parIds.ptr(), parIds.ptr()+offd.nnz,
+       [](const parallelId_t& a, const parallelId_t& b) {
+         if(a.localId < b.localId) return true;
+         if(a.localId > b.localId) return false;
+
+         return (a.globalId < b.globalId);
+       });
+
+  // be careful to make sure Ncols is set at this point
+  NlocalCols = Ncols;
+  Ncols += Noffdcols;
+
+  //make an array of all the column ids required on this rank (local first): 1-based, with nonlocal ids negated
+  colMap.malloc(Ncols);
+  for (dlong n=0; n<NlocalCols; n++)      colMap[n] = n+colOffsetL+1; //local rows
+  for (dlong n=NlocalCols; n<Ncols; n++)  colMap[n] = -(offdcols[n-NlocalCols]+1);    //nonlocal rows
+
+  //make a halo exchange to share column entries and an ogs for gsops across columns
+  bool verbose = false;
+  halo.Setup(Ncols, colMap, comm, ogs::Pairwise, verbose, platform);
+  //shift back to 0-indexed. The sign is dropped by hand instead of calling an unqualified abs(),
+  //which can resolve to the C int abs(int) and truncate 64-bit global ids.
+  for (dlong n=0; n<Ncols; n++) colMap[n] = ((colMap[n]<0) ? -colMap[n] : colMap[n]) - 1;
+
+  //update column numbering: halo columns are appended after the NlocalCols local ones
+  for (dlong n=0;n<offd.nnz;n++)
+    colIds[n] = NlocalCols + parIds[n].newId;
+}
+
+//------------------------------------------------------------------------
+//
+//  parCSR Estimate max Eigenvalue of diagA^{-1}*A
+//
+//------------------------------------------------------------------------
+
+dfloat parCSR::rhoDinvA(memory<dfloat>& null){
+  // Estimate the largest eigenvalue modulus of diagInv*A from the eigenvalues of a small Arnoldi (Hessenberg) matrix
+  int k = 10; // number of Arnoldi steps, reduced below to the global row count if that is smaller
+
+  hlong Ntotal = static_cast<hlong>(Nrows);
+  comm.Allreduce(Ntotal);
+  if(k > Ntotal) k = (int) Ntotal;
+
+  // do an Arnoldi iteration
+
+  // allocate memory for Hessenberg matrix (k x k, addressed below as H[i + j*k])
+  memory<double> H(k*k, 0.0);
+
+  // allocate memory for basis: k+1 vectors of Nrows entries, stored back to back
+  memory<dfloat> V((k+1)*Nrows);
+  memory<dfloat> Vx(Ncols); // work copy handed to SpMV; sized Ncols, only the first Nrows entries are written here
+
+  /*Create rng*/
+  std::uniform_real_distribution<dfloat> distrib(-0.5, 0.5);
+
+  // generate a random vector for initial basis vector
+  for(dlong n=0; n<Nrows; n++) Vx[n] = distrib(RNG);
+
+  /*Project out null vector (subtracting (null.Vx)*null removes the component only if null has unit norm - TODO confirm)*/
+  dfloat nulldot =0.0;
+  for(dlong n=0; n<Nrows; n++) nulldot += null[n]*Vx[n];
+  comm.Allreduce(nulldot);
+
+  #pragma omp parallel for
+  for(dlong n=0; n<Nrows; n++) Vx[n] -= nulldot*null[n];
+
+  // norm_vo = 2-norm of Vx, with the squared partial sums combined across ranks
+  dfloat norm_vo=0.0;
+  for(dlong n=0; n<Nrows; n++) norm_vo += Vx[n]*Vx[n];
+  comm.Allreduce(norm_vo);
+  norm_vo = sqrt(norm_vo);
+
+  // normalize Vx (NOTE(review): no guard against norm_vo == 0)
+  #pragma omp parallel for
+  for(dlong n=0; n<Nrows; n++) Vx[n] *= (1.0/norm_vo);
+
+  //V[0] = Vx
+  #pragma omp parallel for
+  for(dlong n=0; n<Nrows; n++) V[n] = Vx[n];
+
+  for(int j=0; j<k; j++){
+    memory<dfloat> Vj   = V+j*Nrows;
+    memory<dfloat> Vjp1 = V+(j+1)*Nrows;
+
+    //Vx = V[j]
+    #pragma omp parallel for
+    for(dlong n=0; n<Nrows; n++) Vx[n] = Vj[n];
+
+    // v[j+1] = invD*(A*v[j])
+    SpMV(1.0, Vx, 0., Vjp1);
+
+    // scale entrywise by the inverse diagonal
+    #pragma omp parallel for
+    for(dlong n=0; n<Nrows; n++) Vjp1[n] *= diagInv[n];
+
+    // modified Gram-Schmidt against the basis vectors built so far
+    for(int i=0; i<=j; i++){
+      memory<dfloat> Vi = V+i*Nrows;
+      // H(i,j) = v[i]'*A*v[j]
+      // hij = global inner product of v[i] and v[j+1]
+      dfloat hij=0.0;
+      for(dlong n=0; n<Nrows; n++) hij += Vi[n]*Vjp1[n];
+      comm.Allreduce(hij);
+
+      // v[j+1] = v[j+1] - hij*v[i]
+      // (removes the v[i] component before the next inner product is taken)
+      #pragma omp parallel for
+      for(dlong n=0; n<Nrows; n++) Vjp1[n] += -hij*Vi[n];
+
+      H[i + j*k] = (double) hij;
+    }
+
+    if(j+1 < k){ // the subdiagonal entry of the last column would fall outside the k x k matrix
+
+      // norm_vj = global 2-norm of v[j+1]
+      dfloat norm_vj=0.0;
+      for(dlong n=0; n<Nrows; n++) norm_vj += Vjp1[n]*Vjp1[n];
+      comm.Allreduce(norm_vj);
+      norm_vj = sqrt(norm_vj);
+
+      H[j+1+ j*k] = (double) norm_vj;
+
+      // normalize v[j+1] (NOTE(review): a zero norm, i.e. Arnoldi breakdown, is not handled)
+      #pragma omp parallel for
+      for(dlong n=0; n<Nrows; n++) Vjp1[n] *= (1./H[j+1 + j*k]);
+    }
+  }
+
+  memory<double> WR(k); // WR/WI are combined below as real and imaginary parts of each eigenvalue of H
+  memory<double> WI(k);
+
+  linAlg_t::matrixEigenValues(k, H, WR, WI);
+
+  double RHO = 0.;
+  // the estimate is the largest eigenvalue modulus
+  for(int i=0; i<k; i++){
+    double RHO_i  = sqrt(WR[i]*WR[i] + WI[i]*WI[i]);
+
+    if(RHO < RHO_i) {
+      RHO = RHO_i;
+    }
+  }
+
+  // printf("weight = %g \n", RHO);
+
+  return RHO;
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMeshPartition.cpp b/libs/parAdogs/parAdogsMeshPartition.cpp
new file mode 100644
index 000000000..e2c11c11f
--- /dev/null
+++ b/libs/parAdogs/parAdogsMeshPartition.cpp
@@ -0,0 +1,114 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include "timer.hpp"
+#include <random>
+
+namespace libp {
+
+namespace paradogs {
+
+extern std::mt19937 RNG;
+
+void MeshPartition(platform_t &platform,
+                   settings_t &settings,
+                   dlong &Nelements,
+                   const  int dim,
+                   const  int Nverts,
+                   const  int Nfaces,
+                   const  int NfaceVertices,
+                   const  memory<int>& faceVertices,
+                   memory<hlong>& EToV,
+                   memory<hlong>& EToE,
+                   memory<int>& EToF,
+                   memory<dfloat>& EX,
+                   memory<dfloat>& EY,
+                   memory<dfloat>& EZ,
+                   comm_t comm) {
+  /* Repartition a distributed mesh with parAdogs. On return Nelements and
+   * the EToV/EToE/EToF/EX/EY/EZ arrays hold whatever graph_t::ExtractMesh
+   * writes for this rank after partitioning, connecting and reordering.
+   */
+  /* Seed the generator shared by the parAdogs sources with this rank's id */
+  RNG = std::mt19937(comm.rank());
+
+  /* Build the element graph from the incoming mesh arrays */
+  graph_t graph(platform, Nelements, dim,
+                Nverts, Nfaces, NfaceVertices, faceVertices,
+                EToV, EX, EY, EZ,
+                comm);
+
+  /* Everything between the two GlobalTime() calls is reported as */
+  /* partitioning time */
+  timePoint_t tic = GlobalTime(comm);
+
+  /* Choose the partitioner from the settings. The SPECTRAL option is only */
+  /* looked up when INERTIAL was not selected; if neither matches, no */
+  /* repartitioning is performed. */
+  const bool useInertial =
+    settings.compareSetting("PARADOGS PARTITIONING", "INERTIAL");
+  const bool useSpectral = !useInertial &&
+    settings.compareSetting("PARADOGS PARTITIONING", "SPECTRAL");
+
+  if (useInertial) {
+    graph.InertialPartition();
+  } else if (useSpectral) {
+    /* Connect element faces before partitioning (skipped on a single rank) */
+    if (comm.size()>1) graph.Connect();
+
+    graph.SpectralPartition();
+  }
+
+  /* Connect element faces after partitioning */
+  graph.Connect();
+
+  /* Reorder each rank's element list for better locality */
+  graph.CuthillMckee();
+
+  timePoint_t toc = GlobalTime(comm);
+  double elapsedSeconds = ElapsedTime(tic, toc);
+
+  /* Partition statistics, followed by the timing row that closes the table */
+  graph.Report();
+
+  const bool isRoot = (comm.rank()==0);
+  if (isRoot) {
+    printf("   Partitioning time:  %5.2f seconds                                                          |\n",
+           elapsedSeconds);
+    printf("-----------------------------------------------------------------------------------------------\n");
+  }
+
+  /* Hand the new mesh data back through the caller's arrays */
+  graph.ExtractMesh(Nelements,
+                    EToV, EToE, EToF,
+                    EX, EY, EZ);
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMultigrid.cpp b/libs/parAdogs/parAdogsMultigrid.cpp
new file mode 100644
index 000000000..b2d98cda5
--- /dev/null
+++ b/libs/parAdogs/parAdogsMultigrid.cpp
@@ -0,0 +1,94 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include "parAdogs/parAdogsMultigrid.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+/****************************************/
+/* Multigrid vcycle                     */
+/****************************************/
+void graph_t::MultigridVcycle(const int l,
+                              memory<dfloat>& r,
+                              memory<dfloat>& x) {
+  // Run one multigrid V-cycle starting at level l.
+  //   r : right-hand side on that level
+  //   x : solution on that level (the pre-smoother is called with xIsZero=true)
+  // Levels are taken from L; level Nlevels-1 is handled by the coarse solver.
+  const bool atCoarsestLevel = (l==Nlevels-1);
+
+  if (atCoarsestLevel) {
+    // Bottom of the hierarchy: defer to the coarse solver
+    coarseSolver.Solve(r, x);
+  } else {
+    mgLevel_t& fine   = L[l];
+    mgLevel_t& coarse = L[l+1];
+
+    // Pre-smooth, then form the residual res = r - A*x in the level's RES vector
+    fine.Smooth(r, x, true);
+    fine.Residual(r, x, fine.RES);
+
+    // Coarsen the residual into the next level's right-hand side: rhsC = R*res
+    fine.Coarsen(fine.RES, coarse.RHS);
+
+    // Recurse for the coarse correction xC
+    MultigridVcycle(l+1, coarse.RHS, coarse.X);
+
+    // Apply the correction: x = x + P*xC
+    fine.Prolongate(coarse.X, x);
+
+    // Post-smooth, this time with xIsZero=false
+    fine.Smooth(r, x, false);
+  }
+  // x now holds the result of the cycle for this level
+}
+
+void mgLevel_t::Residual(memory<dfloat>& r, memory<dfloat>& x, memory<dfloat>& res) {
+  A.SpMV(-1.0, x, 1.0, r, res); // res = r - A*x
+}
+
+void mgLevel_t::Coarsen(memory<dfloat>& x, memory<dfloat>& xC) {
+  R.SpMV(1.0, x, 0.0, xC); // xC = R*x; beta is 0, so the previous content of xC is not used
+}
+
+void mgLevel_t::Prolongate(memory<dfloat>& xC, memory<dfloat>& x) {
+  P.SpMV(1.0, xC, 1.0, x); // x = x + P*xC: the coarse vector is added to x rather than replacing it
+}
+
+void mgLevel_t::Smooth(memory<dfloat>& r, memory<dfloat>& x, const bool xIsZero) { // Chebyshev smoothing of x for right-hand side r
+  const int ChebyshevIterations=2; // fixed iteration count passed to the smoother
+  A.SmoothChebyshev(r, x, lambda0, lambda1, // lambda0/lambda1: this level's stored bounds for the smoother (presumably eigenvalue estimates - TODO confirm)
+                     xIsZero, scratch,
+                     ChebyshevIterations);
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMultigridAggregate.cpp b/libs/parAdogs/parAdogsMultigridAggregate.cpp
new file mode 100644
index 000000000..892c35ff2
--- /dev/null
+++ b/libs/parAdogs/parAdogsMultigridAggregate.cpp
@@ -0,0 +1,315 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsMatrix.hpp"
+#include "parAdogs/parAdogsPartition.hpp"
+#include <random>
+
+namespace libp {
+
+namespace paradogs {
+
+extern std::mt19937 RNG;
+
+/*Aggregate the vertices around a distance-2 maximal independent set (MIS) of the strong graph*/
+void parCSR::Aggregate(dlong& Nc,
+                       const dfloat theta,
+                       memory<hlong>& FineToCoarse) {
+  /*Out: Nc = number of aggregates rooted on this rank; FineToCoarse[i] = global aggregate id
+    of column i (or -1 if it joined none). In: theta = strength threshold. distrib = tie-break noise*/
+  std::uniform_real_distribution<> distrib(-0.25, 0.25);
+
+  parCSR strong(Nrows, Ncols, platform, comm);
+  strong.diag.rowStarts.malloc(Nrows+1);
+
+  #pragma omp parallel for
+  for(dlong i=0; i<Nrows+1; i++) {
+    strong.diag.rowStarts[i]=0;
+  }
+  /*Count the strong connections of each row: |a_ij| > theta*sqrt(a_ii*|a_jj|), diagonal excluded*/
+  #pragma omp parallel for
+  for(dlong i=0; i<Nrows; i++){
+    int strong_per_row = 0;
+
+    const dfloat Aii = diagA[i];
+
+    //local entries
+    dlong Jstart = diag.rowStarts[i];
+    dlong Jend   = diag.rowStarts[i+1];
+    for(dlong jj= Jstart; jj<Jend; jj++){
+      const dlong col = diag.cols[jj];
+      if (col==i) continue;
+
+      const dfloat Ajj = std::abs(diagA[col]);
+
+      if(std::abs(diag.vals[jj]) > theta*(sqrt(Aii*Ajj)))
+        strong_per_row++;
+    }
+    //non-local entries
+    Jstart = offd.rowStarts[i];
+    Jend   = offd.rowStarts[i+1];
+    for(dlong jj= Jstart; jj<Jend; jj++){
+      const dlong col = offd.cols[jj];
+      const dfloat Ajj = std::abs(diagA[col]);
+
+      if(std::abs(offd.vals[jj]) > theta*(sqrt(Aii*Ajj)))
+        strong_per_row++;
+    }
+
+    strong.diag.rowStarts[i+1] = strong_per_row;
+  }
+
+  // cumulative sum: per-row counts -> CSR row offsets
+  for(dlong i=1; i<Nrows+1 ; i++) {
+    strong.diag.rowStarts[i] += strong.diag.rowStarts[i-1];
+  }
+  strong.diag.nnz = strong.diag.rowStarts[Nrows];
+  strong.diag.cols.malloc(strong.diag.nnz);
+  strong.diag.vals.malloc(strong.diag.nnz);
+
+  // fill in the columns for strong connections; the stored value is |a_ij| plus tie-break noise
+  // (not parallelised: every entry draws from the shared paradogs::RNG)
+  for(dlong i=0; i<Nrows; i++){
+    const dfloat Aii = diagA[i];
+
+    dlong counter = strong.diag.rowStarts[i];
+
+    //local entries
+    dlong Jstart = diag.rowStarts[i];
+    dlong Jend   = diag.rowStarts[i+1];
+    for(dlong jj= Jstart; jj<Jend; jj++){
+      const dlong col = diag.cols[jj];
+      if (col==i) continue;
+
+      const dfloat Ajj = std::abs(diagA[col]);
+
+      if(std::abs(diag.vals[jj]) > theta*(sqrt(Aii*Ajj))) {
+        strong.diag.cols[counter] = col;
+        strong.diag.vals[counter++] = std::abs(diag.vals[jj]) + distrib(paradogs::RNG);
+      }
+    }
+    //non-local entries
+    Jstart = offd.rowStarts[i];
+    Jend   = offd.rowStarts[i+1];
+    for(dlong jj= Jstart; jj<Jend; jj++){
+      const dlong col = offd.cols[jj];
+
+      const dfloat Ajj = std::abs(diagA[col]);
+
+      if(std::abs(offd.vals[jj]) > theta*(sqrt(Aii*Ajj))) {
+        strong.diag.cols[counter] = col;
+        strong.diag.vals[counter++] = std::abs(offd.vals[jj]) + distrib(paradogs::RNG);
+      }
+    }
+  }
+
+  memory<float> rand(Ncols); /*MIS priority of each column*/
+  memory<int>   Ts(Ncols);   /*Ts/Tr/Tn: largest (state, priority, global id) seen in the 1-ring*/
+  memory<float> Tr(Ncols);
+  memory<hlong> Tn(Ncols);
+
+  /*Initialize state array*/
+  /*  0 - Undecided */
+  /* -1 - Not MIS */
+  /*  1 - MIS */
+  memory<int>   state(Ncols, 0);
+
+  /*Use vertex degree with random noise to break ties*/
+  // (not parallelised: draws from the shared paradogs::RNG)
+  for (dlong n=0;n<Nrows;++n) {
+    rand[n] = strong.diag.rowStarts[n+1]
+              - strong.diag.rowStarts[n]
+              + distrib(paradogs::RNG);
+  }
+
+  //fill halo region
+  halo.Exchange(rand, 1);
+
+  do {
+    // first neighbours
+    #pragma omp parallel for
+    for(dlong n=0; n<Nrows; n++){
+      int    smax = state[n];
+
+      if (smax==1) continue;
+
+      float  rmax = rand[n];
+      hlong  nmax = colMap[n];
+
+      for(dlong j=strong.diag.rowStarts[n];j<strong.diag.rowStarts[n+1];j++){
+        const dlong k  = strong.diag.cols[j];
+        const int   sk = state[k];
+        const float rk = rand[k];
+        const hlong nk = colMap[k];
+        if ((sk>smax)              || /*If neighbor is MIS node*/
+           ((sk==smax)&&(rk>rmax)) || /*Else if it has a bigger weight*/
+           ((sk==smax)&&(rk==rmax)&&(nk>nmax))) { /*Rare, but just in case, break tie with index number*/
+          smax = sk;
+          rmax = rk;
+          nmax = nk;
+        }
+      }
+      Ts[n] = smax;
+      Tr[n] = rmax;
+      Tn[n] = nmax;
+    }
+
+    //share results
+    halo.Exchange(Ts, 1);
+    halo.Exchange(Tr, 1);
+    halo.Exchange(Tn, 1);
+
+    // second neighbours
+    #pragma omp parallel for
+    for(dlong n=0; n<Nrows; n++){
+      if (state[n]!=0) continue;
+
+      int   smax = Ts[n];
+      float rmax = Tr[n];
+      hlong nmax = Tn[n];
+
+      for(dlong j=strong.diag.rowStarts[n];j<strong.diag.rowStarts[n+1];j++){
+        const dlong k = strong.diag.cols[j];
+        const int   sk = Ts[k];
+        const float rk = Tr[k];
+        const hlong nk = Tn[k]; /*global index: must stay hlong, it is compared with colMap below*/
+        if ((sk>smax)              || /*If neighbor is MIS node*/
+           ((sk==smax)&&(rk>rmax)) || /*Else if it has a bigger weight*/
+           ((sk==smax)&&(rk==rmax)&&(nk>nmax))) { /*Rare, but just in case, break tie with index number*/
+          smax = sk;
+          rmax = rk;
+          nmax = nk;
+        }
+      }
+
+      // if I am the strongest among all the 1 and 2 ring neighbours
+      // I am an MIS node
+      if(nmax == colMap[n]) state[n] = 1;
+
+      // if there is an MIS node within distance 2, I am removed
+      if(smax>0) state[n] = -1;
+    }
+
+    //share results
+    halo.Exchange(state, 1);
+
+    // if number of undecided nodes = 0, algorithm terminates
+    hlong cnt = 0;
+    for (dlong n=0;n<Nrows;n++) if (state[n]==0) cnt++;
+    comm.Allreduce(cnt);
+
+    if (cnt==0) break;
+
+  } while(true);
+
+  rand.free();
+  Tr.free();
+  Tn.free();
+
+
+  // count the coarse nodes/aggregates
+  Nc=0;
+  for(dlong i=0; i<Nrows; i++)
+    if(state[i] == 1) Nc++;
+
+  /*Get global offsets*/
+  hlong localNc=static_cast<hlong>(Nc);
+  hlong NcOffsetL=0, NcOffsetU=0;
+  comm.Scan(localNc, NcOffsetU);
+  NcOffsetL = NcOffsetU-Nc;
+
+  /*Initialize Matching array*/
+  Nc=0;
+  for(dlong i=0; i<Nrows; i++) {
+    if(state[i] == 1) {
+      Ts[i] = 1;
+      FineToCoarse[i] = NcOffsetL+Nc++;
+    } else {
+      Ts[i] = -1;
+      FineToCoarse[i] = -1;
+    }
+  }
+
+  //share the initial aggregate flags
+  halo.Exchange(Ts, 1);
+  halo.Exchange(FineToCoarse, 1);
+
+  // first neighbours
+  #pragma omp parallel for
+  for(dlong n=0; n<Nrows; n++){
+    if (FineToCoarse[n]==-1) {
+      for(dlong j=strong.diag.rowStarts[n];j<strong.diag.rowStarts[n+1];j++){
+        const dlong k  = strong.diag.cols[j];
+        /*FineToCoarse[k] is read only for MIS nodes: their entries are never rewritten by this loop*/
+
+        /*If this node is an MIS node, join the aggregate*/
+        if (state[k]==1) {
+          FineToCoarse[n] = FineToCoarse[k]; /*hlong to hlong: the global id is not narrowed*/
+          Ts[n] = 1;
+          break;
+        }
+      }
+    }
+  }
+
+  halo.Exchange(Ts, 1);
+  halo.Exchange(FineToCoarse, 1);
+
+  // second neighbours
+  #pragma omp parallel for
+  for(dlong n=0; n<Nrows; n++){
+    if (FineToCoarse[n]==-1) { //If we're still undecided
+      hlong cmax = -1;
+      float rmax = -1.0;
+      hlong kmax = -1;
+
+      for(dlong j=strong.diag.rowStarts[n];j<strong.diag.rowStarts[n+1];j++){
+        const dlong k = strong.diag.cols[j];
+        const int   sk = Ts[k];
+        const hlong nk = colMap[k];
+        if (sk!=-1) { /*If the neighbor is in the neighborhood of an MIS node*/
+          /*candidates are ranked by the noise-perturbed strength of the connecting edge*/
+          const float rk = strong.diag.vals[j];
+          if( (rk>rmax)            || /*If edge is strongest*/
+             ((rk==rmax)&&(nk>kmax))) { /*Rare, but just in case, break tie with index number*/
+            cmax = FineToCoarse[k];
+            rmax = rk;
+            kmax = nk;
+          }
+        }
+      }
+      FineToCoarse[n] = cmax;
+    }
+  }
+
+  //share results
+  halo.Exchange(FineToCoarse, 1);
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMultigridCoarseSolver.cpp b/libs/parAdogs/parAdogsMultigridCoarseSolver.cpp
new file mode 100644
index 000000000..74a8d0777
--- /dev/null
+++ b/libs/parAdogs/parAdogsMultigridCoarseSolver.cpp
@@ -0,0 +1,148 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+void coarseSolver_t::Solve(memory<dfloat>& rhs, memory<dfloat>& x) {
+
+  // Collect the right-hand sides of all ranks into grhs
+  comm.Allgatherv(rhs, N,
+                  grhs, coarseCounts, coarseOffsets);
+
+  #pragma omp parallel for
+  for (int row=0;row<N;++row) {
+    const int rowStart = row*coarseTotal; // invA is stored row-major, coarseTotal per row
+    dfloat sum=0.0;
+    for (int col=0;col<coarseTotal;++col)
+      sum += invA[rowStart + col]*grhs[col];
+    x[row] = sum;
+  }
+}
+
+void coarseSolver_t::Setup(parCSR& A, memory<dfloat>& null) {
+
+  /*Assemble the dense matrix A + null*null^T from all ranks, pass it to
+    linAlg_t::matrixInverse, and keep this rank's rows of the result in invA.
+    Solve() later applies those rows to the gathered right-hand side.*/
+
+  comm = A.comm;
+  const int size = comm.size();
+
+  Nrows = A.Nrows;
+  Ncols = A.Ncols;
+  N = static_cast<int>(A.Nrows);
+
+  /*Row count of every rank, and its exclusive prefix sum*/
+  coarseCounts.malloc(size);
+  coarseOffsets.malloc(size);
+
+  comm.Allgather(N, coarseCounts);
+
+  coarseTotal=0;
+  for (int rank=0;rank<size;++rank) {
+    coarseOffsets[rank] = coarseTotal;
+    coarseTotal += coarseCounts[rank];
+  }
+
+  /*Every rank needs the complete null vector for the rank-one boost*/
+  memory<dfloat> gnull(coarseTotal);
+
+  comm.Allgatherv( null, N,
+                  gnull, coarseCounts, coarseOffsets);
+
+  /*This rank's rows of the dense matrix, row-major with coarseTotal columns*/
+  memory<dfloat> localA(N*coarseTotal);
+
+  #pragma omp parallel for
+  for (int i=0;i<N;i++) {
+    /*null boost: row i of null*gnull^T*/
+    for (int j=0;j<coarseTotal;j++) {
+      localA[i*coarseTotal+j] = null[i]*gnull[j];
+    }
+
+    /*local block: local column ids are shifted by colOffsetL*/
+    const int jEnd = static_cast<int>(A.diag.rowStarts[i+1]);
+    for (int jj=static_cast<int>(A.diag.rowStarts[i]);jj<jEnd;jj++) {
+      const int gcol = static_cast<int>(A.diag.cols[jj] + A.colOffsetL);
+      localA[i*coarseTotal+gcol] += A.diag.vals[jj];
+    }
+  }
+  gnull.free();
+
+  /*non-local block: column ids are looked up in colMap*/
+  #pragma omp parallel for
+  for (int i=0;i<A.offd.nzRows;i++) {
+    const int row  = static_cast<int>(A.offd.rows[i]);
+    const int jEnd = static_cast<int>(A.offd.mRowStarts[i+1]);
+    for (int jj=static_cast<int>(A.offd.mRowStarts[i]);jj<jEnd;jj++) {
+      const int gcol = static_cast<int>(A.colMap[A.offd.cols[jj]]);
+      localA[row*coarseTotal+gcol] += A.offd.vals[jj];
+    }
+  }
+
+  /*Gather the row blocks into the full coarseTotal x coarseTotal matrix.
+    Counts and offsets are in rows, so scale them to entries first...*/
+  memory<dfloat> gA(coarseTotal*coarseTotal);
+
+  for (int rank=0;rank<size;++rank) {
+    coarseCounts[rank]  *= coarseTotal;
+    coarseOffsets[rank] *= coarseTotal;
+  }
+
+  comm.Allgatherv(localA, N*coarseTotal,
+                      gA, coarseCounts, coarseOffsets);
+  localA.free();
+
+  /*...and restore them afterwards: Solve() gathers the rhs with them*/
+  for (int rank=0;rank<size;++rank) {
+    coarseCounts[rank]  /= coarseTotal;
+    coarseOffsets[rank] /= coarseTotal;
+  }
+
+  linAlg_t::matrixInverse(coarseTotal, gA);
+
+  /*Keep only the rows owned by this rank (global rows start at rowOffsetL)*/
+  invA.malloc(N*coarseTotal);
+
+  #pragma omp parallel for
+  for (int i=0;i<N;i++) {
+    for (int j=0;j<coarseTotal;j++) {
+      invA[i*coarseTotal+j] = gA[(i+A.rowOffsetL)*coarseTotal+j];
+    }
+  }
+
+  /*Buffer into which Solve() gathers the global right-hand side*/
+  grhs.malloc(coarseTotal);
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMultigridLaplacian.cpp b/libs/parAdogs/parAdogsMultigridLaplacian.cpp
new file mode 100644
index 000000000..2fb1b89ed
--- /dev/null
+++ b/libs/parAdogs/parAdogsMultigridLaplacian.cpp
@@ -0,0 +1,166 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include "parAdogs/parAdogsMultigrid.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+/*Build level 0 of the hierarchy: the graph Laplacian of the mesh connectivity, plus its null vector*/
+void graph_t::CreateLaplacian() {
+  /*The hierarchy starts as a single level holding a square Nverts x Nverts matrix*/
+  Nlevels=1;
+  L[0].A = parCSR(Nverts, Nverts, platform, comm);
+  parCSR& A = L[0].A;
+  /*Global ids in [VoffsetL, VoffsetU) are treated as local to this rank below*/
+  A.rowOffsetL = VoffsetL;
+  A.rowOffsetU = VoffsetU;
+  A.colOffsetL = VoffsetL;
+  A.colOffsetU = VoffsetU;
+
+  /*Pass 1: count the entries of every row (stored shifted by one in rowStarts)*/
+  A.diag.rowStarts.malloc(Nverts+1);
+  A.offd.rowStarts.malloc(Nverts+1);
+
+  #pragma omp parallel for
+  for (dlong n=0;n<Nverts+1;++n) {
+    A.diag.rowStarts[n] = 0;
+    A.offd.rowStarts[n] = 0;
+  }
+  /*E[n] is the global id of the neighbour through face n; -1 is skipped (presumably: no neighbour)*/
+  for (dlong e=0;e<Nverts;++e) {
+    A.diag.rowStarts[e+1]++; /*Count diagonal*/
+
+    for (int n=0;n<Nfaces;++n) {
+      const hlong gE = elements[e].E[n];
+      if (gE!=-1) {
+        if (gE>=VoffsetL && gE<VoffsetU) {
+          A.diag.rowStarts[e+1]++; /*count connections per vert*/
+        } else {
+          A.offd.rowStarts[e+1]++; /*count connections per vert*/
+        }
+      }
+    }
+  }
+
+  // count the rows that have at least one off-rank connection
+  A.offd.nzRows=0;
+  for(dlong e=0;e<Nverts; e++) {
+    if (A.offd.rowStarts[e+1]>0) A.offd.nzRows++;
+  }
+
+  A.offd.rows.malloc(A.offd.nzRows);
+  A.offd.mRowStarts.malloc(A.offd.nzRows+1);
+
+  /*cumulative sum: counts become CSR offsets; offd also gets a compressed list of its non-empty rows*/
+  dlong cnt=0;
+  A.offd.mRowStarts[0]=0;
+  for (dlong e=0;e<Nverts;++e) {
+    if (A.offd.rowStarts[e+1]>0) {
+      A.offd.rows[cnt] = e; //record row id
+      A.offd.mRowStarts[cnt+1] = A.offd.mRowStarts[cnt] + A.offd.rowStarts[e+1];
+      cnt++;
+    }
+    A.diag.rowStarts[e+1] += A.diag.rowStarts[e];
+    A.offd.rowStarts[e+1] += A.offd.rowStarts[e];
+  }
+  A.diag.nnz = A.diag.rowStarts[Nverts];
+  A.offd.nnz = A.offd.rowStarts[Nverts];
+
+  /*Halo setup: collect the global ids of all off-rank neighbours, row by row and face by face*/
+  cnt=0;
+  colIds.malloc(A.offd.nnz);
+  for (dlong e=0;e<Nverts;++e) {
+    for (int n=0;n<Nfaces;++n) {
+      const hlong gE = elements[e].E[n];
+      if (gE!=-1) {
+        if (gE<VoffsetL || gE>=VoffsetU) {
+          colIds[cnt++] = gE;
+        }
+      }
+    }
+  }
+  A.haloSetup(colIds); //setup halo, and transform colIds to a local indexing
+  Nhalo = A.Ncols-A.Nrows; /*Record how big the halo region is*/
+
+  /*Pass 2: fill columns and values. Each row is a 0 diagonal slot followed by -1 per neighbour*/
+  A.diagA.malloc(A.Ncols);
+  A.diagInv.malloc(A.Nrows);
+  A.diag.cols.malloc(A.diag.nnz);
+  A.offd.cols.malloc(A.offd.nnz);
+  A.diag.vals.malloc(A.diag.nnz);
+  A.offd.vals.malloc(A.offd.nnz);
+  /*nnz are reset and reused as running fill positions; the loop brings them back to their totals*/
+  A.diag.nnz=0;
+  A.offd.nnz=0;
+  for (dlong e=0;e<Nverts;++e) {
+    A.diag.cols[A.diag.nnz] = e;
+    pfloat& Ann = A.diag.vals[A.diag.nnz];
+    A.diag.nnz++;
+    /*Ann aliases the diagonal entry of row e and accumulates the number of neighbours of e*/
+    Ann = 0.0;
+
+    for (int n=0;n<Nfaces;++n) {
+      const hlong gE = elements[e].E[n];
+      if (gE!=-1) {
+        if (gE>=VoffsetL && gE<VoffsetU) {
+          A.diag.cols[A.diag.nnz] = static_cast<dlong>(gE-VoffsetL);
+          A.diag.vals[A.diag.nnz] = -1.0;
+          A.diag.nnz++;
+        } else {
+          A.offd.cols[A.offd.nnz] = colIds[A.offd.nnz]; /*relies on the same traversal order as the halo setup loop*/
+          A.offd.vals[A.offd.nnz] = -1.0;
+          A.offd.nnz++;
+        }
+        Ann += 1.0;
+      }
+    }
+    A.diagA[e] = Ann;
+    A.diagInv[e] = 1.0/Ann; /*NOTE(review): Ann is 0 here for a vertex without any neighbour*/
+  }
+
+  //fill the halo region of diagA (it was allocated with Ncols entries, only Nverts are set above)
+  A.halo.Exchange(A.diagA, 1);
+
+  L[0].Nrows = A.Nrows;
+  L[0].Ncols = A.Ncols;
+  L[0].Nglobal = NVertsGlobal;
+
+  /*Construct fine null vector: the constant vector with every entry 1/sqrt(NVertsGlobal)*/
+  L[0].null.malloc(Nverts);
+
+  #pragma omp parallel for
+  for (dlong n=0;n<Nverts;++n) {
+    L[0].null[n] = 1.0/sqrt(NVertsGlobal);
+  }
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMultigridSetup.cpp b/libs/parAdogs/parAdogsMultigridSetup.cpp
new file mode 100644
index 000000000..3a111ab30
--- /dev/null
+++ b/libs/parAdogs/parAdogsMultigridSetup.cpp
@@ -0,0 +1,190 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+/****************************************/
+/* Construct Multigrid Hierarchy        */
+/****************************************/
+void graph_t::MultigridSetup() {
+
+  CreateLaplacian();
+
+  /*Stop coarsening once a level has at most this many global nodes*/
+  const int coarseSize = 100;
+
+  /*Stall detection: coarsening is considered stalled when a new level
+    still has more than coarseTol times the nodes of its parent*/
+  const float coarseTol = 0.8;
+
+  /*Strength threshold passed to the aggregation, halved after every level*/
+  dfloat theta=0.08;
+
+  while (true) {
+    /*Current coarsest level*/
+    mgLevel_t& fine = L[Nlevels-1];
+
+    /*Small enough: this level goes to the coarse solver and we are done*/
+    if (fine.Nglobal <= coarseSize) {
+      coarseSolver.Setup(fine.A, fine.null);
+      break;
+    }
+
+    LIBP_ABORT("Paradogs: Max levels exceeded in coarse graph creation. Increase MAX_LEVELS.",
+               Nlevels>=MAX_LEVELS);
+
+    /*This level will be smoothed, so it needs its Chebyshev bounds*/
+    fine.SetupSmoother();
+
+    /*Construct the next level via coarsening*/
+    mgLevel_t& coarse = L[Nlevels];
+    coarse.CoarsenLevel(fine, theta);
+    Nlevels++;
+
+    // Use a smaller strength threshold on each successive level.
+    // See: Algebraic Multigrid On Unstructured Meshes, P Vanek, J. Mandel, M. Brezina.
+    theta /= 2;
+
+    /*Stop early if the new level barely shrank; it becomes the coarsest*/
+    const bool stalled = (coarse.Nglobal > coarseTol*fine.Nglobal);
+    if (stalled) {
+      LIBP_FORCE_WARNING("Paradogs: Graph coarsening stalling. Coarse graph has " << coarse.Nglobal << " nodes.");
+      coarseSolver.Setup(coarse.A, coarse.null);
+      break;
+    }
+  }
+
+  /*Allocate the work vectors of every level of the finished hierarchy*/
+  for (int l=0;l<Nlevels;++l) L[l].AllocateScratch(l);
+}
+
+void mgLevel_t::SetupSmoother() {
+  // Computes the Chebyshev smoothing interval [lambda0, lambda1] of this level
+
+  // rhoDinvA estimates rho(invD * A); it is handed this level's null vector
+  A.rho = A.rhoDinvA(null);
+
+  // Upper end is the estimate itself, lower end a tenth of it
+  lambda0 = A.rho/10.;
+  lambda1 = A.rho;
+}
+
+void mgLevel_t::AllocateScratch(const int l) {
+  /*Allocate this level's work vectors; l is the level's index in the hierarchy*/
+  /*MultigridVcycle passes the rhs/solution of its starting level in as arguments
+    and uses RHS/X only for the next coarser level, so level 0 gets none*/
+  if (l>0) {
+    RHS.malloc(Nrows);
+    X.malloc(Ncols);
+  }
+
+  /*Fiedler vector storage and the residual buffer of the V-cycle*/
+  Fiedler.malloc(Ncols);
+  RES.malloc(Ncols);
+
+  /*SmoothChebyshev carves two work vectors out of this buffer*/
+  scratch.malloc(2*Ncols);
+}
+
+
+
+/*Build this level as the aggregation-based coarsening of the finer level Lf*/
+void mgLevel_t::CoarsenLevel(mgLevel_t& Lf, const dfloat theta) {
+  /*Besides filling this level's A, Nrows, Ncols and Nglobal, this stores
+    the transfer operators P and R on Lf and may enlarge Lf.Ncols*/
+
+  /*Aggregate the fine vertices: aggregateOf[i] is the global aggregate id of
+    fine column i, numCoarse the number of aggregates created on this rank*/
+  const dlong numFine = Lf.Nrows;
+  dlong numCoarse=0;
+  memory<hlong> aggregateOf(Lf.Ncols);
+  Lf.A.Aggregate(numCoarse, theta, aggregateOf);
+
+  /*Tentative prolongator built from the aggregates; it is handed the fine
+    null vector and this level's null vector*/
+  parCSR tentative = TentativeProlongator(numFine, numCoarse,
+                                          Lf.A.platform, Lf.A.comm,
+                                          aggregateOf,
+                                          Lf.null, null);
+  aggregateOf.free();
+
+  /*Smooth the tentative prolongator, then release it*/
+  Lf.P = SmoothProlongator(Lf.A, tentative);
+  tentative = parCSR();
+
+  /*Restriction is the transpose of the prolongation*/
+  Lf.R = Transpose(Lf.P);
+  Lf.Ncols = std::max(Lf.Ncols, Lf.R.Ncols);
+
+  /*Galerkin coarse operator A = R*(Lf.A*P); the intermediate is released*/
+  parCSR AtimesP = SpMM(Lf.A, Lf.P);
+  A = SpMM(Lf.R, AtimesP);
+  AtimesP = parCSR();
+
+  /*Extract the diagonal of the new operator and its reciprocal*/
+  A.diagA.malloc(A.Ncols);
+  A.diagInv.malloc(A.Nrows);
+
+  #pragma omp parallel for
+  for (dlong row=0;row<A.Nrows;row++) {
+    for (dlong jj=A.diag.rowStarts[row];jj<A.diag.rowStarts[row+1];jj++) {
+      if (A.diag.cols[jj]!=row) continue;
+
+      /*only the first matching entry of the row is used*/
+      A.diagA[row] = A.diag.vals[jj];
+      A.diagInv[row] = 1.0/A.diagA[row];
+      break;
+    }
+  }
+
+  /*diagA has Ncols entries: fill the halo part through the exchange*/
+  A.halo.Exchange(A.diagA, 1);
+
+  /*Ncols is the larger of A's and P's column counts*/
+  Nrows = A.Nrows;
+  Ncols = std::max(A.Ncols, Lf.P.Ncols);
+
+  /*Global size: the local row count reduced over all ranks*/
+  Nglobal = static_cast<hlong>(Nrows);
+  A.comm.Allreduce(Nglobal);
+}
+
+/*Release the multigrid hierarchy: halo column ids, coarse solver and every level*/
+void graph_t::MultigridDestroy() {
+  colIds.free();
+  coarseSolver = coarseSolver_t();
+  /*Reset the levels from coarsest to finest; Nlevels ends at 0*/
+  while (Nlevels>0) L[--Nlevels] = mgLevel_t();
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMultigridSmooth.cpp b/libs/parAdogs/parAdogsMultigridSmooth.cpp
new file mode 100644
index 000000000..95752d187
--- /dev/null
+++ b/libs/parAdogs/parAdogsMultigridSmooth.cpp
@@ -0,0 +1,177 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsMatrix.hpp"
+#include "parAdogs/parAdogsPartition.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+void parCSR::SmoothChebyshev(memory<dfloat>& b, memory<dfloat>& x,
+                             const dfloat lambda0, const dfloat lambda1,
+                             const bool xIsZero, memory<dfloat>& scratch,
+                             const int ChebyshevIterations) {
+  // Chebyshev smoothing of A*x = b scaled by the inverse diagonal; x is updated in place
+  const dfloat theta = 0.5*(lambda1+lambda0);
+  const dfloat delta = 0.5*(lambda1-lambda0);
+  const dfloat invTheta = 1.0/theta;
+  const dfloat sigma = theta/delta;
+  dfloat rho_n = 1./sigma;
+  dfloat rho_np1;
+  // Work vectors taken from scratch, Ncols apart: d = latest update, r = D^{-1}*residual
+  memory<dfloat> d = scratch + 0*Ncols;
+  memory<dfloat> r = scratch + 1*Ncols;
+
+  if(xIsZero){ //skip the Ax if x is zero
+    // r = D^{-1}b
+    // d = invTheta*r
+    // x = d
+    #pragma omp parallel for
+    for (dlong n=0;n<Nrows;++n) {
+      const dfloat r_r = diagInv[n]*b[n];
+      r[n] = r_r;
+      d[n] = invTheta*r_r;
+      x[n] = invTheta*r_r;
+    }
+
+  } else {
+    halo.ExchangeStart(x, 1);
+    // diag part first, between ExchangeStart and ExchangeFinish (presumably to overlap the halo traffic)
+    //r = D^{-1}(b-A*x)
+    #pragma omp parallel for
+    for (dlong n=0;n<Nrows;++n) {
+
+      dfloat rn = b[n];
+
+      const dlong start = diag.rowStarts[n];
+      const dlong end   = diag.rowStarts[n+1];
+      for (dlong j=start;j<end;++j) {
+        rn -= diag.vals[j]*x[diag.cols[j]];
+      }
+
+      r[n] = diagInv[n]*rn;
+    }
+
+    halo.ExchangeFinish(x, 1);
+    // offd part: runs after ExchangeFinish and adds its contribution to r
+    #pragma omp parallel for
+    for(dlong n=0; n<offd.nzRows; n++){ //only the rows listed in offd.rows
+
+      dfloat rn = 0.0;
+
+      const dlong row = offd.rows[n];
+      const dlong start = offd.mRowStarts[n];
+      const dlong end   = offd.mRowStarts[n+1];
+      for(dlong j=start; j<end; ++j) {
+        rn -= offd.vals[j]*x[offd.cols[j]];
+      }
+
+      r[row] += diagInv[row]*rn;
+    }
+    // when no Chebyshev iterations follow, d is never read again and is not stored
+    const int last_it = (ChebyshevIterations==0) ? 1 : 0;
+
+    //d = invTheta*r
+    //x = x + d
+    if (last_it) {
+      #pragma omp parallel for
+      for (dlong n=0;n<Nrows;++n) {
+        x[n] += invTheta*r[n];
+      }
+    } else {
+      #pragma omp parallel for
+      for (dlong n=0;n<Nrows;++n) {
+        d[n] = invTheta*r[n];
+        x[n] += d[n];
+      }
+    }
+  }
+
+  for (int k=0;k<ChebyshevIterations;k++) {
+    // same diag / offd split as above, now applied to the update d
+    halo.ExchangeStart(d, 1);
+
+    //r_k+1 = r_k - D^{-1}Ad_k
+    #pragma omp parallel for
+    for (dlong n=0;n<Nrows;++n) {
+
+      dfloat rn = 0.0;
+
+      const dlong start = diag.rowStarts[n];
+      const dlong end   = diag.rowStarts[n+1];
+      for (dlong j=start;j<end;++j) {
+        rn -= diag.vals[j]*d[diag.cols[j]];
+      }
+
+      r[n] += diagInv[n]*rn;
+    }
+
+    halo.ExchangeFinish(d, 1);
+
+    #pragma omp parallel for
+    for(dlong n=0; n<offd.nzRows; n++){ //only the rows listed in offd.rows
+
+      dfloat rn = 0.0;
+
+      const dlong row = offd.rows[n];
+      const dlong start = offd.mRowStarts[n];
+      const dlong end   = offd.mRowStarts[n+1];
+      for(dlong j=start; j<end; ++j) {
+        rn -= offd.vals[j]*d[offd.cols[j]];
+      }
+
+      r[row] += diagInv[row]*rn;
+    }
+    // on the final iteration d is not needed again, so only x is updated
+    const int last_it = (k==ChebyshevIterations-1) ? 1 : 0;
+
+    rho_np1 = 1.0/(2.*sigma-rho_n);
+
+    //d_k+1 = rho_k+1*rho_k*d_k  + 2*rho_k+1*r_k+1/delta
+    //x_k+1 = x_k + d_k+1
+    if (last_it) {
+      #pragma omp parallel for
+      for (dlong n=0;n<Nrows;++n) {
+        const dfloat d_np1 = (rho_np1*rho_n)*d[n] + (2.0*rho_np1/delta)*r[n];
+        x[n] += d_np1;
+      }
+    } else {
+      #pragma omp parallel for
+      for (dlong n=0;n<Nrows;++n) {
+        d[n] = (rho_np1*rho_n)*d[n] + (2.0*rho_np1/delta)*r[n];
+        x[n] += d[n];
+      }
+    }
+
+    rho_n = rho_np1;
+  }
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMultigridSmoothPrologator.cpp b/libs/parAdogs/parAdogsMultigridSmoothPrologator.cpp
new file mode 100644
index 000000000..b422ddef5
--- /dev/null
+++ b/libs/parAdogs/parAdogsMultigridSmoothPrologator.cpp
@@ -0,0 +1,332 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+parCSR SmoothProlongator(const parCSR& A, const parCSR& T) {
+
+  // Number of ranks in A's communicator
+  int size = A.comm.size();
+
+  // This function computes a smoothed prolongation operator
+  // via a single weighted Jacobi iteration on the tentative
+  // prolongator, i.e.,
+  //
+  //   P = (I - omega*D^{-1}*A)*T
+  //
+  // To compute D^{-1}*A*T we need all the rows T(j,:) for which
+  // j is a column index for the nonzeros of A on this rank.
+  // For all local column indices in A.diag, we will already
+  // have the row of T on this rank, so we just need to gather
+  // the offd colIds
+
+  //Jacobi weight, (4/3) scaled by 1/A.rho
+  const dfloat omega = (4./3.)/A.rho;
+
+  memory<hlong> recvRows(A.Ncols-A.NlocalCols); //global ids of the rows of T to fetch
+  memory<int> sendCounts(size);
+  memory<int> recvCounts(size, 0);
+  memory<int> sendOffsets(size+1);
+  memory<int> recvOffsets(size+1);
+
+  memory<hlong> globalRowStarts(size+1);
+  globalRowStarts[0]=0;
+  T.comm.Allgather(T.rowOffsetU, globalRowStarts+1); //collect T.rowOffsetU from every rank
+
+  //use the colMap of A to list the needed rows of T
+  int r=0;
+  for (dlong n=A.NlocalCols;n<A.Ncols;n++) {
+    const hlong id = A.colMap[n];
+    while (id>=globalRowStarts[r+1]) r++; //assumes the halo is sorted
+    recvCounts[r]++;
+    recvRows[n-A.NlocalCols] = id; //record the row to recv
+  }
+  globalRowStarts.free();
+
+  //share the counts: each rank learns how many rows the others want from it
+  A.comm.Alltoall(recvCounts, sendCounts);
+
+  sendOffsets[0]=0;
+  recvOffsets[0]=0;
+  for (r=0;r<size;r++) {
+    sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
+    recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
+  }
+
+  int sendTotal = sendOffsets[size];
+  memory<hlong> sendRows(sendTotal);
+
+  //share the rowIds (the request list goes out, the list of rows to serve comes in)
+  T.comm.Alltoallv(recvRows, recvCounts, recvOffsets,
+                   sendRows, sendCounts, sendOffsets);
+
+  //we now have a list of rows to send, count the nnz to send
+  dlong nnzTotal=0;
+  for (r=0;r<size;r++) {
+    sendCounts[r] =0; //reset: from here on the counts are nonzeros, not rows
+    for (int n=sendOffsets[r];n<sendOffsets[r+1];n++) {
+      const dlong i = static_cast<dlong>(sendRows[n]-T.rowOffsetL); //local row id
+      sendCounts[r]+= T.diag.rowStarts[i+1]-T.diag.rowStarts[i]; //count local entries in this row
+      sendCounts[r]+= T.offd.rowStarts[i+1]-T.offd.rowStarts[i]; //count nonlocal entries in this row
+    }
+    nnzTotal += sendCounts[r]; //tally the total
+  }
+
+  memory<nonZero_t> sendNonZeros(nnzTotal);
+
+  nnzTotal=0; //reset, reused as the write cursor into sendNonZeros
+  for (r=0;r<size;r++) {
+    for (int n=sendOffsets[r];n<sendOffsets[r+1];n++) {
+      const dlong i = static_cast<dlong>(sendRows[n] - T.rowOffsetL); //local row id
+      for (dlong jj=T.diag.rowStarts[i]; jj<T.diag.rowStarts[i+1];jj++){
+        sendNonZeros[nnzTotal].row = sendRows[n];
+        sendNonZeros[nnzTotal].col = T.diag.cols[jj] + T.colOffsetL; //global col id
+        sendNonZeros[nnzTotal].val = T.diag.vals[jj];
+        nnzTotal++;
+      }
+      for (dlong jj=T.offd.rowStarts[i]; jj<T.offd.rowStarts[i+1];jj++){
+        sendNonZeros[nnzTotal].row = sendRows[n];
+        sendNonZeros[nnzTotal].col = T.colMap[T.offd.cols[jj]]; //global col id
+        sendNonZeros[nnzTotal].val = T.offd.vals[jj];
+        nnzTotal++;
+      }
+    }
+  }
+
+  A.comm.Alltoall(sendCounts, recvCounts); //share the nonzero counts
+
+  for (r=0;r<size;r++) { //rebuild the offsets for the nonzero exchange ([0] entries are still 0)
+    sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
+    recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
+  }
+
+
+  dlong Toffdnnz = recvOffsets[size]; //total nonzeros
+  memory<nonZero_t> ToffdRows(Toffdnnz);
+
+  T.comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets,
+                      ToffdRows, recvCounts, recvOffsets);
+
+  //clean up
+  sendNonZeros.free();
+  sendRows.free();
+  recvRows.free();
+  sendCounts.free();
+  recvCounts.free();
+  sendOffsets.free();
+  recvOffsets.free();
+
+  //we now have all the needed nonlocal rows (should also be sorted by row then col)
+
+  //make an array of row offsets so we know how large each row is
+  memory<dlong> ToffdRowOffsets(A.Ncols-A.NlocalCols+1, 0);
+
+  dlong id=0;
+  for (dlong n=0;n<Toffdnnz;n++) {
+    hlong row = ToffdRows[n].row;
+
+    while(A.colMap[id+A.NlocalCols]!=row) id++; //advance to the halo column of this row (relies on matching order, no bounds check)
+
+    ToffdRowOffsets[id+1]++; //count entry in row
+  }
+
+  //cumulative sum
+  for (dlong n=0;n<A.Ncols-A.NlocalCols;n++)
+    ToffdRowOffsets[n+1] += ToffdRowOffsets[n];
+
+
+  // The next step to compute D^{-1}*A*T is to multiply each entry A(i,j) by the
+  // row T(j,:), store all the results, sort them by row+col, and compress
+  // the entries
+
+  // Find how big the intermediate form is
+  memory<dlong> rowStarts(A.Nrows+1, 0);
+  memory<dlong> rowCounts(A.Nrows, 0);
+
+  /*Count entries per row (each iteration only writes rowStarts[i+1])*/
+  #pragma omp parallel for
+  for(dlong i=0; i<A.Nrows; i++) {
+    /*Start with entries for T*/
+    rowStarts[i+1]+=T.diag.rowStarts[i+1]-T.diag.rowStarts[i] +
+                    T.offd.rowStarts[i+1]-T.offd.rowStarts[i];
+
+    /*Then add entries from A*T*/
+    dlong Jstart = A.diag.rowStarts[i];
+    dlong Jend   = A.diag.rowStarts[i+1];
+    for(dlong jj=Jstart; jj<Jend; jj++){
+      const dlong col = A.diag.cols[jj];
+      rowStarts[i+1]+=T.diag.rowStarts[col+1]-T.diag.rowStarts[col] +
+                      T.offd.rowStarts[col+1]-T.offd.rowStarts[col];
+    }
+    //non-local entries
+    Jstart = A.offd.rowStarts[i];
+    Jend   = A.offd.rowStarts[i+1];
+    for (dlong jj=Jstart;jj<Jend;jj++) {
+      const dlong col = A.offd.cols[jj]-A.NlocalCols; //index into the received rows
+      rowStarts[i+1]+= ToffdRowOffsets[col+1] - ToffdRowOffsets[col];
+    }
+  }
+
+  /*Cumulative sum*/
+  for(dlong i=1; i<A.Nrows+1; i++) {
+    rowStarts[i] += rowStarts[i-1];
+  }
+
+  dlong NNZ = rowStarts[A.Nrows]; //size of the uncompressed intermediate form
+
+  memory<nonZero_t> Ptmp(NNZ);
+
+  //count total number of nonzeros we find
+  dlong nnz =0;
+
+  // Fill the intermediate form of P
+  // #pragma omp parallel for (left disabled: the loop accumulates into nnz)
+  for (dlong i=0;i<A.Nrows;i++) {
+    const dlong cStart = rowStarts[i];
+    dlong& c = rowCounts[i]; //write cursor within row i
+
+    /*Start with P=T entries*/
+
+    //local T entries
+    dlong start = T.diag.rowStarts[i];
+    dlong end   = T.diag.rowStarts[i+1];
+    for (dlong j=start;j<end;j++) {
+      Ptmp[cStart+c].row = i + T.rowOffsetL;
+      Ptmp[cStart+c].col = T.diag.cols[j] + T.colOffsetL; //global id
+      Ptmp[cStart+c].val = T.diag.vals[j];
+      c++;
+    }
+    //non-local T entries
+    start = T.offd.rowStarts[i];
+    end   = T.offd.rowStarts[i+1];
+    for (dlong j=start;j<end;j++) {
+      Ptmp[cStart+c].row = i + T.rowOffsetL;
+      Ptmp[cStart+c].col = T.colMap[T.offd.cols[j]]; //global id
+      Ptmp[cStart+c].val = T.offd.vals[j];
+      c++;
+    }
+
+    /*Then P -= omega*invD*A*T*/
+
+    //local A entries
+    start = A.diag.rowStarts[i];
+    end   = A.diag.rowStarts[i+1];
+
+    const dfloat invDi = 1.0/A.diagA[i];
+
+    for (dlong j=start;j<end;j++) {
+      const dlong col = A.diag.cols[j];
+      const dfloat Aval = -omega*invDi*A.diag.vals[j]; //sign and Jacobi scaling folded into the A entry
+
+      //local T entries
+      dlong Tstart = T.diag.rowStarts[col];
+      dlong Tend   = T.diag.rowStarts[col+1];
+      for (dlong jj=Tstart;jj<Tend;jj++) {
+        Ptmp[cStart+c].row = i + A.rowOffsetL;
+        Ptmp[cStart+c].col = T.diag.cols[jj] + T.colOffsetL; //global id
+        Ptmp[cStart+c].val = Aval*T.diag.vals[jj];
+        c++;
+      }
+      //non-local T entries
+      Tstart = T.offd.rowStarts[col];
+      Tend   = T.offd.rowStarts[col+1];
+      for (dlong jj=Tstart;jj<Tend;jj++) {
+        Ptmp[cStart+c].row = i + A.rowOffsetL;
+        Ptmp[cStart+c].col = T.colMap[T.offd.cols[jj]]; //global id
+        Ptmp[cStart+c].val = Aval*T.offd.vals[jj];
+        c++;
+      }
+    }
+    //non-local A entries
+    start = A.offd.rowStarts[i];
+    end   = A.offd.rowStarts[i+1];
+    for (dlong j=start;j<end;j++) {
+      const dlong col = A.offd.cols[j]-A.NlocalCols;
+      const dfloat Aval = -omega*invDi*A.offd.vals[j];
+
+      // entries from received rows of T
+      dlong Tstart = ToffdRowOffsets[col];
+      dlong Tend   = ToffdRowOffsets[col+1];
+      for (dlong jj=Tstart;jj<Tend;jj++) {
+        Ptmp[cStart+c].row = i + A.rowOffsetL;
+        Ptmp[cStart+c].col = ToffdRows[jj].col; //global id
+        Ptmp[cStart+c].val = Aval*ToffdRows[jj].val;
+        c++;
+      }
+    }
+
+    //sort entries in this row by col id so duplicates become adjacent
+    std::sort(Ptmp.ptr()+cStart, Ptmp.ptr()+cStart+c,
+              [](const nonZero_t& a, const nonZero_t& b) {
+                return a.col < b.col;
+              });
+
+    /*Count how many actual nonzeros will be in this row*/
+    dlong nnzRow=0;
+    if (c>0) nnzRow++;
+    for (dlong j=1;j<c;j++) {
+      if ((Ptmp[cStart+j].col!=Ptmp[cStart+j-1].col)) nnzRow++;
+    }
+
+    nnz+=nnzRow; //Add to total
+  }
+  ToffdRowOffsets.free();
+  ToffdRows.free();
+
+  rowStarts.free();
+  rowCounts.free();
+
+  // nnz now holds the number of distinct (row,col) pairs
+  memory<nonZero_t> entries(nnz);
+
+  //compress nonzeros: sum runs of entries sharing the same row and col
+  nnz = 0;
+  if (NNZ) entries[nnz++] = Ptmp[0];
+  for (dlong i=1;i<NNZ;i++) {
+    if ((Ptmp[i].row!=Ptmp[i-1].row)||
+        (Ptmp[i].col!=Ptmp[i-1].col)) {
+      entries[nnz++] = Ptmp[i];
+    } else {
+      entries[nnz-1].val += Ptmp[i].val;
+    }
+  }
+  //clean up
+  Ptmp.free();
+
+  //build P from coo matrix
+  return parCSR(A.Nrows, T.NlocalCols,
+                nnz, entries,
+                A.platform, A.comm);
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMultigridSpMM.cpp b/libs/parAdogs/parAdogsMultigridSpMM.cpp
new file mode 100644
index 000000000..e0cf50671
--- /dev/null
+++ b/libs/parAdogs/parAdogsMultigridSpMM.cpp
@@ -0,0 +1,294 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsMatrix.hpp"
+#include "parAdogs/parAdogsPartition.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+parCSR SpMM(const parCSR& A, const parCSR& B){
+
+  // Number of ranks in A's communicator
+  int size = A.comm.size();
+
+  // To compute C = A*B we need all the rows B(j,:) for which
+  // j is a column index for the nonzeros of A on this rank.
+  // For all local column indices in A.diag, we will already
+  // have the row of B on this rank, so we just need to gather
+  // the offd colIds
+
+  memory<hlong> recvRows(A.Ncols-A.NlocalCols); //global ids of the rows of B to fetch
+  memory<int> sendCounts(size);
+  memory<int> recvCounts(size, 0);
+  memory<int> sendOffsets(size+1);
+  memory<int> recvOffsets(size+1);
+
+  memory<hlong> globalRowStarts(size+1);
+  globalRowStarts[0]=0;
+  B.comm.Allgather(B.rowOffsetU, globalRowStarts+1); //collect B.rowOffsetU from every rank
+
+  //use the colMap of A to list the needed rows of B
+  int r=0;
+  for (dlong n=A.NlocalCols;n<A.Ncols;n++) {
+    const hlong id = A.colMap[n];
+    while (id>=globalRowStarts[r+1]) r++; //assumes the halo is sorted
+    recvCounts[r]++;
+    recvRows[n-A.NlocalCols] = id; //record the row to recv
+  }
+  globalRowStarts.free();
+
+  //share the counts: each rank learns how many rows the others want from it
+  A.comm.Alltoall(recvCounts, sendCounts);
+
+  sendOffsets[0]=0;
+  recvOffsets[0]=0;
+  for (r=0;r<size;r++) {
+    sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
+    recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
+  }
+
+  int sendTotal = sendOffsets[size];
+  memory<hlong> sendRows(sendTotal);
+
+  //share the rowIds (the request list goes out, the list of rows to serve comes in)
+  B.comm.Alltoallv(recvRows, recvCounts, recvOffsets,
+                   sendRows, sendCounts, sendOffsets);
+
+  //we now have a list of rows to send, count the nnz to send
+  dlong NNZ=0;
+  for (r=0;r<size;r++) {
+    sendCounts[r] =0; //reset: from here on the counts are nonzeros, not rows
+    for (int n=sendOffsets[r];n<sendOffsets[r+1];n++) {
+      const dlong i = static_cast<dlong>(sendRows[n]-B.rowOffsetL); //local row id
+      sendCounts[r]+= B.diag.rowStarts[i+1]-B.diag.rowStarts[i]; //count local entries in this row
+      sendCounts[r]+= B.offd.rowStarts[i+1]-B.offd.rowStarts[i]; //count nonlocal entries in this row
+    }
+    NNZ += sendCounts[r]; //tally the total
+  }
+
+  memory<nonZero_t> sendNonZeros(NNZ);
+
+  NNZ=0; //reset, reused as the write cursor into sendNonZeros
+  for (r=0;r<size;r++) {
+    for (int n=sendOffsets[r];n<sendOffsets[r+1];n++) {
+      const dlong i = static_cast<dlong>(sendRows[n] - B.rowOffsetL); //local row id
+      for (dlong jj=B.diag.rowStarts[i]; jj<B.diag.rowStarts[i+1];jj++){
+        sendNonZeros[NNZ].row = sendRows[n];
+        sendNonZeros[NNZ].col = B.diag.cols[jj] + B.colOffsetL; //global col id
+        sendNonZeros[NNZ].val = B.diag.vals[jj];
+        NNZ++;
+      }
+      for (dlong jj=B.offd.rowStarts[i]; jj<B.offd.rowStarts[i+1];jj++){
+        sendNonZeros[NNZ].row = sendRows[n];
+        sendNonZeros[NNZ].col = B.colMap[B.offd.cols[jj]]; //global col id
+        sendNonZeros[NNZ].val = B.offd.vals[jj];
+        NNZ++;
+      }
+    }
+  }
+
+  A.comm.Alltoall(sendCounts, recvCounts); //share the nonzero counts
+
+  for (r=0;r<size;r++) { //rebuild the offsets for the nonzero exchange ([0] entries are still 0)
+    sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
+    recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
+  }
+
+
+  dlong Boffdnnz = recvOffsets[size]; //total nonzeros
+  memory<nonZero_t> BoffdRows(Boffdnnz);
+
+  B.comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets,
+                      BoffdRows, recvCounts, recvOffsets);
+
+  //clean up
+  sendNonZeros.free();
+  sendRows.free();
+  recvRows.free();
+  sendCounts.free();
+  recvCounts.free();
+  sendOffsets.free();
+  recvOffsets.free();
+
+  //we now have all the needed nonlocal rows (should also be sorted by row then col)
+
+  //make an array of row offsets so we know how large each row is
+  memory<dlong> BoffdRowOffsets(A.Ncols-A.NlocalCols+1, 0);
+
+  dlong id=0;
+  for (dlong n=0;n<Boffdnnz;n++) {
+    hlong row = BoffdRows[n].row;
+
+    while(A.colMap[id+A.NlocalCols]!=row) id++; //advance to the halo column of this row (relies on matching order, no bounds check)
+
+    BoffdRowOffsets[id+1]++; //count entry in row
+  }
+
+  //cumulative sum
+  for (dlong n=0;n<A.Ncols-A.NlocalCols;n++)
+    BoffdRowOffsets[n+1] += BoffdRowOffsets[n];
+
+
+  // The next step to compute C = A*B is to multiply each entry A(i,j) by the
+  // row B(j,:), store all the results, sort them by row+col, and compress
+  // the entries
+
+  // Find how big the intermediate form is
+  memory<dlong> rowStarts(A.Nrows+1, 0);
+  memory<dlong> rowCounts(A.Nrows, 0);
+
+  /*Count entries per row (each iteration only writes rowStarts[i+1])*/
+  #pragma omp parallel for
+  for (dlong i=0;i<A.Nrows;i++) {
+    //local entries
+    dlong start = A.diag.rowStarts[i];
+    dlong end   = A.diag.rowStarts[i+1];
+    for (dlong j=start;j<end;j++) {
+      const dlong col = A.diag.cols[j];
+      rowStarts[i+1] +=  B.diag.rowStarts[col+1]-B.diag.rowStarts[col]
+                        +B.offd.rowStarts[col+1]-B.offd.rowStarts[col];
+    }
+    //non-local entries
+    start = A.offd.rowStarts[i];
+    end   = A.offd.rowStarts[i+1];
+    for (dlong j=start;j<end;j++) {
+      const dlong col = A.offd.cols[j]-A.NlocalCols; //index into the received rows
+      rowStarts[i+1] += BoffdRowOffsets[col+1] - BoffdRowOffsets[col];
+    }
+  }
+
+  /*Cumulative sum*/
+  for(dlong i=1; i<A.Nrows+1; i++) {
+    rowStarts[i] += rowStarts[i-1];
+  }
+
+  NNZ = rowStarts[A.Nrows]; //size of the uncompressed intermediate form
+
+  memory<nonZero_t> Ctmp(NNZ);
+
+  //count total number of nonzeros;
+  dlong nnz =0;
+
+  // Fill the intermediate form of C
+  // #pragma omp parallel for (left disabled: the loop accumulates into nnz)
+  for (dlong i=0;i<A.Nrows;i++) {
+    const dlong cStart = rowStarts[i];
+    dlong& c = rowCounts[i]; //write cursor within row i
+
+    //local A entries
+    dlong start = A.diag.rowStarts[i];
+    dlong end   = A.diag.rowStarts[i+1];
+    for (dlong j=start;j<end;j++) {
+      const dlong col = A.diag.cols[j];
+      const dfloat Aval = A.diag.vals[j];
+
+      //local B entries
+      dlong Bstart = B.diag.rowStarts[col];
+      dlong Bend   = B.diag.rowStarts[col+1];
+      for (dlong jj=Bstart;jj<Bend;jj++) {
+        Ctmp[cStart+c].row = i + A.rowOffsetL;
+        Ctmp[cStart+c].col = B.diag.cols[jj] + B.colOffsetL; //global id
+        Ctmp[cStart+c].val = Aval*B.diag.vals[jj];
+        c++;
+      }
+      //non-local B entries
+      Bstart = B.offd.rowStarts[col];
+      Bend   = B.offd.rowStarts[col+1];
+      for (dlong jj=Bstart;jj<Bend;jj++) {
+        Ctmp[cStart+c].row = i + A.rowOffsetL;
+        Ctmp[cStart+c].col = B.colMap[B.offd.cols[jj]]; //global id
+        Ctmp[cStart+c].val = Aval*B.offd.vals[jj];
+        c++;
+      }
+    }
+    //non-local A entries
+    start = A.offd.rowStarts[i];
+    end   = A.offd.rowStarts[i+1];
+    for (dlong j=start;j<end;j++) {
+      const dlong col = A.offd.cols[j]-A.NlocalCols;
+      const dfloat Aval = A.offd.vals[j];
+
+      // entries from received rows of B
+      dlong Bstart = BoffdRowOffsets[col];
+      dlong Bend   = BoffdRowOffsets[col+1];
+      for (dlong jj=Bstart;jj<Bend;jj++) {
+        Ctmp[cStart+c].row = i + A.rowOffsetL;
+        Ctmp[cStart+c].col = BoffdRows[jj].col; //global id
+        Ctmp[cStart+c].val = Aval*BoffdRows[jj].val;
+        c++;
+      }
+    }
+
+    //sort entries in this row by col id so duplicates become adjacent
+    std::sort(Ctmp.ptr()+cStart, Ctmp.ptr()+cStart+c,
+              [](const nonZero_t& a, const nonZero_t& b) {
+                return a.col < b.col;
+              });
+
+    /*Count how many actual nonzeros will be in this row*/
+    dlong nnzRow=0;
+    if (c>0) nnzRow++;
+    for (dlong j=1;j<c;j++) {
+      if ((Ctmp[cStart+j].col!=Ctmp[cStart+j-1].col)) nnzRow++;
+    }
+
+    nnz+=nnzRow; //Add to total
+  }
+  BoffdRowOffsets.free();
+  BoffdRows.free();
+
+  rowStarts.free();
+  rowCounts.free();
+
+  // nnz now holds the number of distinct (row,col) pairs
+  memory<nonZero_t> entries(nnz);
+
+  //compress nonzeros: sum runs of entries sharing the same row and col
+  nnz = 0;
+  if (NNZ) entries[nnz++] = Ctmp[0];
+  for (dlong i=1;i<NNZ;i++) {
+    if ((Ctmp[i].row!=Ctmp[i-1].row)||
+        (Ctmp[i].col!=Ctmp[i-1].col)) {
+      entries[nnz++] = Ctmp[i];
+    } else {
+      entries[nnz-1].val += Ctmp[i].val;
+    }
+  }
+  //clean up
+  Ctmp.free();
+
+  //build C from coo matrix
+  return parCSR(A.Nrows, B.NlocalCols,
+                nnz, entries,
+                A.platform, A.comm);
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMultigridTentativeProlongator.cpp b/libs/parAdogs/parAdogsMultigridTentativeProlongator.cpp
new file mode 100644
index 000000000..fc195b5d2
--- /dev/null
+++ b/libs/parAdogs/parAdogsMultigridTentativeProlongator.cpp
@@ -0,0 +1,101 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+parCSR TentativeProlongator(const dlong Nf,
+                            const dlong Nc,
+                            platform_t& platform,
+                            comm_t comm,
+                            memory<hlong>& FineToCoarse,
+                            memory<dfloat>& FineNull,
+                            memory<dfloat>& CoarseNull) {
+  /*Scan the local row counts; the result minus Nf is our first global row*/
+  hlong NfLocal = static_cast<hlong>(Nf);
+  hlong NfScan = 0;
+  comm.Scan(NfLocal, NfScan);
+  const hlong rowOffset = NfScan - Nf;
+
+  /*T has one nonzero per fine row: T(n, FineToCoarse[n]) = FineNull[n]*/
+  dlong Nentries = Nf;
+  memory<nonZero_t> coo(Nentries);
+
+  #pragma omp parallel for
+  for (dlong i=0; i<Nf; ++i) {
+    nonZero_t& e = coo[i];
+    e.row = rowOffset + i;
+    e.col = FineToCoarse[i];
+    e.val = FineNull[i];
+  }
+
+  parCSR T(Nf, Nc, Nentries, coo, platform, comm);
+  coo.free();
+
+  /*Coarse null vector: allocate one slot per column of T, zero-filled*/
+  CoarseNull.malloc(T.Ncols);
+
+  #pragma omp parallel for
+  for (dlong c=0; c<T.Ncols; ++c) CoarseNull[c] = 0.0;
+
+  /*Accumulate the squared entries of every column of T*/
+  for (dlong j=0; j<T.diag.nnz; ++j) { //local nonzeros
+    const auto v = T.diag.vals[j];
+    CoarseNull[T.diag.cols[j]] += v*v;
+  }
+  for (dlong j=0; j<T.offd.nnz; ++j) { //nonlocal nonzeros
+    const auto v = T.offd.vals[j];
+    CoarseNull[T.offd.cols[j]] += v*v;
+  }
+
+  /*Add the halo values to their origins*/
+  T.halo.Combine(CoarseNull, 1);
+
+  /*Square root of the accumulated sums for the first Nc entries*/
+  #pragma omp parallel for
+  for (dlong c=0; c<Nc; ++c)
+    CoarseNull[c] = sqrt(CoarseNull[c]);
+
+  /*Share the results back out through the halo*/
+  T.halo.Exchange(CoarseNull, 1);
+
+  /*Scale every entry of T by the CoarseNull value of its column*/
+  #pragma omp parallel for
+  for (dlong j=0; j<T.diag.nnz; ++j) T.diag.vals[j] /= CoarseNull[T.diag.cols[j]];
+
+  #pragma omp parallel for
+  for (dlong j=0; j<T.offd.nnz; ++j) T.offd.vals[j] /= CoarseNull[T.offd.cols[j]];
+
+  return T;
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsMultigridTranspose.cpp b/libs/parAdogs/parAdogsMultigridTranspose.cpp
new file mode 100644
index 000000000..614f26c29
--- /dev/null
+++ b/libs/parAdogs/parAdogsMultigridTranspose.cpp
@@ -0,0 +1,177 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::sort;
+#else
+using std::sort;
+#endif
+
+namespace libp {
+
+namespace paradogs {
+
+parCSR Transpose(const parCSR& A) {
+  // Build the transpose of A: nonlocal entries are sent to the rank that
+  // owns their column, local entries are flipped in place on this rank.
+  int size = A.comm.size();
+
+  // copy data from nonlocal entries into send buffer (row and col swapped)
+  memory<nonZero_t> sendNonZeros(A.offd.nnz);
+  for(dlong i=0;i<A.offd.nzRows;++i){
+    const hlong row = A.offd.rows[i] + A.rowOffsetL; //global ids
+    for (dlong j=A.offd.mRowStarts[i];j<A.offd.mRowStarts[i+1];j++) {
+      const hlong col =  A.colMap[A.offd.cols[j]]; //global ids
+      sendNonZeros[j].row = col;
+      sendNonZeros[j].col = row;
+      sendNonZeros[j].val = A.offd.vals[j];
+    }
+  }
+
+  //sort by destination row (then col), which also groups entries by destination rank
+  std::sort(sendNonZeros.ptr(), sendNonZeros.ptr()+A.offd.nnz,
+            [](const nonZero_t& a, const nonZero_t& b) {
+              if (a.row < b.row) return true;
+              if (a.row > b.row) return false;
+
+              return a.col < b.col;
+            });
+
+  //count number of non-zeros we're sending to each rank
+  memory<int> sendCounts(size, 0);
+  memory<int> recvCounts(size);
+  memory<int> sendOffsets(size+1);
+  memory<int> recvOffsets(size+1);
+
+  memory<hlong> globalColStarts(size+1);
+  globalColStarts[0]=0;
+  A.comm.Allgather(A.colOffsetU, globalColStarts+1); //collect A.colOffsetU from every rank
+
+  int r=0;
+  for (dlong n=0;n<A.offd.nnz;n++) {
+    const hlong row = sendNonZeros[n].row; //global id: must stay hlong, a dlong would truncate it
+    while(row>=globalColStarts[r+1]) r++;
+    sendCounts[r]++;
+  }
+  globalColStarts.free();
+
+  A.comm.Alltoall(sendCounts, recvCounts);
+
+  sendOffsets[0]=0;
+  recvOffsets[0]=0;
+  for (r=0;r<size;r++) {
+    sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
+    recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
+  }
+  dlong offdnnz = recvOffsets[size]; //total offd nonzeros
+
+  memory<nonZero_t> offdNonZeros(offdnnz);
+
+  // receive non-local nonzeros
+  A.comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets,
+                   offdNonZeros, recvCounts, recvOffsets);
+
+  //clean up
+  sendNonZeros.free();
+  sendCounts.free();
+  recvCounts.free();
+  sendOffsets.free();
+  recvOffsets.free();
+
+  dlong NNZ = A.diag.nnz+offdnnz; //local entries plus the received ones
+
+  memory<nonZero_t> entries(NNZ);
+
+  memory<dlong> rowStarts(A.NlocalCols+1, 0);
+  memory<dlong> rowCounts(A.NlocalCols, 0);
+
+  /*Count entries per row of the transpose (i.e. per local column of A)*/
+  for(dlong i=0; i<A.Nrows; i++) {
+    const dlong Jstart = A.diag.rowStarts[i];
+    const dlong Jend   = A.diag.rowStarts[i+1];
+
+    for(dlong jj=Jstart; jj<Jend; jj++){
+      rowStarts[A.diag.cols[jj]+1]++;
+    }
+  }
+  for(dlong i=0; i<offdnnz; i++) {
+    const dlong row = static_cast<dlong>(offdNonZeros[i].row-A.colOffsetL);
+    rowStarts[row+1]++;
+  }
+
+  /*Cumulative sum*/
+  for(dlong i=1; i<A.NlocalCols+1; i++) {
+    rowStarts[i] += rowStarts[i-1];
+  }
+
+  //fill local nonzeros
+  // (kept serial: different rows i may bump the same rowCounts[row])
+  for(dlong i=0; i<A.Nrows; i++){
+    const dlong Jstart = A.diag.rowStarts[i];
+    const dlong Jend   = A.diag.rowStarts[i+1];
+
+    for(dlong jj=Jstart; jj<Jend; jj++){
+      const dlong row = A.diag.cols[jj];
+      const dlong c = rowStarts[row] + rowCounts[row];
+
+      entries[c].row = row + A.colOffsetL;
+      entries[c].col = i + A.rowOffsetL;
+      entries[c].val = A.diag.vals[jj];
+      rowCounts[row]++;
+    }
+  }
+  for(dlong i=0; i<offdnnz; i++) { //then the received entries, already in transposed form
+    const dlong row = static_cast<dlong>(offdNonZeros[i].row-A.colOffsetL);
+    const dlong c = rowStarts[row] + rowCounts[row];
+    entries[c] = offdNonZeros[i];
+    rowCounts[row]++;
+  }
+
+  offdNonZeros.free();
+
+  //sort each row by column id (rows occupy disjoint ranges of entries)
+  #pragma omp parallel for
+  for(dlong i=0; i<A.NlocalCols; i++){
+    const dlong Nentries = rowStarts[i+1]-rowStarts[i];
+    const dlong c = rowStarts[i];
+    std::sort(entries.ptr()+c, entries.ptr()+c+Nentries,
+          [](const nonZero_t& a, const nonZero_t& b) {
+            return a.col < b.col;
+          });
+  }
+
+  return parCSR(A.NlocalCols, A.Nrows,
+                NNZ, entries,
+                A.platform, A.comm);
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsParallelPivot.cpp b/libs/parAdogs/parAdogsParallelPivot.cpp
new file mode 100644
index 000000000..9e962871a
--- /dev/null
+++ b/libs/parAdogs/parAdogsParallelPivot.cpp
@@ -0,0 +1,105 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include "parAdogs/parAdogsPartition.hpp"
+#include <algorithm>
+#include <limits>
+
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::partition;
+#else
+using std::partition;
+#endif
+
+namespace libp {
+
+namespace paradogs {
+
+/* Bisection step for ParallelPivot: split A[left,right) about the midpoint
+   of [min,max] and recurse toward k global entries <= pivot. */
+static dfloat Pivot(memory<dfloat>& A, const dlong left, const dlong right,
+                    const hlong k, const dfloat min, const dfloat max,
+                    comm_t comm) {
+  /*Start with guessing a pivot halfway between min and max*/
+  const dfloat pivot = (min+max)/2.0;
+
+  /*Bail out if we're looking at a tiny window*/
+  constexpr dfloat TOL = (sizeof(dfloat)==8) ? 1.0e-13 : 1.0E-5;
+  if (max-min < TOL) return pivot;
+
+  /*Also bail out when the midpoint is not strictly inside (min,max): the
+    window can no longer shrink, so recursing would never terminate*/
+  if (!(min < pivot && pivot < max)) return pivot;
+
+  /*Move local entries <= pivot to the front of A[left,right)*/
+  dfloat* Am = partition(A.ptr()+left, A.ptr()+right, [pivot](const dfloat& a){ return a <= pivot; });
+
+  /*Get how many entries are globally <= pivot*/
+  hlong localCnt = Am-A.ptr();
+  hlong globalCnt = localCnt;
+  comm.Allreduce(globalCnt);
+  if (globalCnt==k) return pivot;
+
+  /*Too many: search [min,pivot] in the front part; too few: [pivot,max] in the rest*/
+  if (k<globalCnt) return Pivot(A, left, localCnt, k, min, pivot, comm);
+  return Pivot(A, localCnt, right, k, pivot, max, comm);
+}
+
+/* Given a distributed vector F (N local entries) in comm, find a pivot value,
+   such that there are globally k entries of F which are <= pivot. */
+dfloat ParallelPivot(const dlong N, memory<dfloat>& F,
+                     const hlong k, comm_t comm) {
+
+  /*Make a copy of input vector (Pivot reorders its working array in place)*/
+  memory<dfloat> A(N);
+
+  #pragma omp parallel for
+  for (dlong n=0;n<N;++n) {
+    A[n] = F[n];
+  }
+  /*Find global minimum/maximum. Seed the max with lowest(): for floating
+    types numeric_limits::min() is the smallest positive value*/
+  dfloat globalMin=std::numeric_limits<dfloat>::max();
+  dfloat globalMax=std::numeric_limits<dfloat>::lowest();
+  for (dlong n=0;n<N;++n) {
+    globalMax = std::max(A[n], globalMax);
+    globalMin = std::min(A[n], globalMin);
+  }
+  comm.Allreduce(globalMin, Comm::Min);
+  comm.Allreduce(globalMax, Comm::Max);
+
+  /*Find pivot point via binary search*/
+  dfloat pivot = Pivot(A, 0, N, k, globalMin, globalMax, comm);
+
+  return pivot;
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsRefine.cpp b/libs/parAdogs/parAdogsRefine.cpp
new file mode 100644
index 000000000..2783351c4
--- /dev/null
+++ b/libs/parAdogs/parAdogsRefine.cpp
@@ -0,0 +1,141 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include "parAdogs/parAdogsPartition.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+/****************************************/
+/* Refine Fiedler Vector               */
+/****************************************/
+void graph_t::Refine(const int level) {
+  /*Refines L[level].Fiedler in place; maxIters passes at most, none if err is already below RELTOL*/
+  parCSR& A = L[level].A;
+  memory<dfloat>& null = L[level].null;
+  const dlong N = L[level].Nrows;
+  const dlong Ncols = L[level].Ncols;
+
+  memory<dfloat>& Fiedler = L[level].Fiedler;
+
+  /*******************************************************/
+  /*Improve fine Fiedler vector via Inverse Iteration    */
+  /*******************************************************/
+
+  const dfloat RELTOL = 3.0e-1; /*iteration stops once err < RELTOL*/
+  const dfloat CG_TOL = 1.0e-2; /*tolerance handed to Solve()*/
+
+  const int maxIters=1;
+
+  memory<dfloat> x(Ncols);
+  memory<dfloat> scratch(3*Ncols); /*Solve() uses this as three work vectors at offsets 0, Ncols, 2*Ncols*/
+  memory<dfloat> AF = scratch; /*NOTE(review): presumably shares storage with scratch rather than copying - confirm*/
+
+  /*AF = A*F*/
+  A.SpMV(1.0, Fiedler, 0.0, AF);
+
+  /*theta = F^T * A * F */
+  dfloat theta = 0.0;
+  dfloat normAF = 0.0;
+  for (dlong n=0;n<N;++n) {
+    theta += Fiedler[n]*AF[n];
+    normAF += AF[n]*AF[n];
+  }
+  comm.Allreduce(theta);
+  comm.Allreduce(normAF);
+  /*err = ||A*F - theta*F||/theta provided ||F||=1 (unit norm on entry is assumed - TODO confirm)*/
+  dfloat err = sqrt(std::abs(normAF - theta*theta))/theta;
+
+  // if (rank==0) printf("Intial err = %f, theta = %f, ||AF|| = %f \n", err, theta, sqrt(normAF));
+
+  for (int it=0;it<maxIters;++it) {
+
+    if (err<RELTOL) break;
+    /*Guess x = F/theta, then overwrite F with the matching residual F - A*x = F - AF/theta*/
+    #pragma omp parallel for
+    for (dlong n=0;n<N;++n) {
+      x[n] = Fiedler[n]/theta;
+    }
+
+    #pragma omp parallel for
+    for (dlong n=0;n<N;++n) {
+      Fiedler[n] = Fiedler[n] - AF[n]/theta;
+    }
+
+    /*Solve A_{l}*x = Fiedler*/
+    (void) Solve(level, CG_TOL, Fiedler, x, scratch);
+    // const int cg_iter = Solve(level, CG_TOL, Fiedler, x, scratch);
+
+    /*Project out null vector*/
+    dfloat dot=0.0;
+    for (dlong n=0;n<N;++n) {
+      dot += x[n]*null[n];
+    }
+    comm.Allreduce(dot);
+
+    #pragma omp parallel for
+    for (dlong n=0;n<N;++n) {
+      x[n] -= dot*null[n];
+    }
+
+    dfloat normx = 0.0;
+    for (dlong n=0;n<N;++n) {
+      normx += x[n]*x[n];
+    }
+    comm.Allreduce(normx);
+    normx = sqrt(normx);
+
+    /*F = x /||x||*/
+    #pragma omp parallel for
+    for (dlong n=0;n<N;++n) {
+      Fiedler[n] = x[n]/normx;
+    }
+
+    /*AF = A*F*/
+    A.SpMV(1.0, Fiedler, 0.0, AF);
+
+    /*theta = F^T * A * F */
+    theta = 0.0;
+    normAF = 0.0;
+    for (dlong n=0;n<N;++n) {
+      theta += Fiedler[n]*AF[n];
+      normAF += AF[n]*AF[n];
+    }
+    comm.Allreduce(theta);
+    comm.Allreduce(normAF);
+
+    err = sqrt(std::abs(normAF - theta*theta))/theta;
+
+    // if (rank==0)  printf("err = %f, theta = %f, ||AF|| = %f, cg_iter=%d\n", err, theta, sqrt(normAF), cg_iter);
+  }
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/mesh/meshOccaSetupQuad3D.cpp b/libs/parAdogs/parAdogsSettings.cpp
similarity index 65%
rename from libs/mesh/meshOccaSetupQuad3D.cpp
rename to libs/parAdogs/parAdogsSettings.cpp
index dc6094089..ffd2ea232 100644
--- a/libs/mesh/meshOccaSetupQuad3D.cpp
+++ b/libs/parAdogs/parAdogsSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,21 +24,25 @@ SOFTWARE.
 
 */
 
-#include "mesh.hpp"
-#include "mesh/mesh3D.hpp"
+#include "parAdogs.hpp"
 
-void meshQuad3D::OccaSetup(){
+namespace libp {
 
-  this->mesh3D::OccaSetup();
+namespace paradogs {
 
-  o_D = platform.malloc(Nq*Nq*sizeof(dfloat), D);
+void AddSettings(settings_t& settings) {
 
-  o_S    = o_D; //dummy
-  o_MM   = o_D; //dummy
-  o_sM   = o_D; //dummy
-  o_LIFT = o_D; //dummy
+  settings.newSetting("PARADOGS PARTITIONING",
+                      "INERTIAL",
+                      "Type of Mesh partitioning",
+                      {"NONE", "INERTIAL", "SPECTRAL"});
+}
+
+void ReportSettings(settings_t& settings) {
 
-  o_vgeo = platform.malloc((Nelements+totalHaloPairs)*Nvgeo*Np*sizeof(dfloat), vgeo);
-  o_sgeo = platform.malloc(Nelements*Nfaces*Nfp*Nsgeo*sizeof(dfloat), sgeo);
-  o_ggeo = platform.malloc(Nelements*Np*Nggeo*sizeof(dfloat), ggeo);
+  settings.reportSetting("PARADOGS PARTITIONING");
 }
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsSolve.cpp b/libs/parAdogs/parAdogsSolve.cpp
new file mode 100644
index 000000000..ff848c0bc
--- /dev/null
+++ b/libs/parAdogs/parAdogsSolve.cpp
@@ -0,0 +1,131 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include "parAdogs/parAdogsPartition.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+/****************************************/
+/* Solve A_{l}*x = b                    */
+/****************************************/
+int graph_t::Solve(const int level, 
+                   const dfloat TOL,
+                   memory<dfloat>& r,
+                   memory<dfloat>& x,
+                   memory<dfloat>& scratch) {
+  /*CG loop preconditioned by MultigridVcycle: updates x and r in place, returns the iteration count*/
+  parCSR& A = L[level].A;
+  const dlong N = A.Nrows;
+  const dlong Ncols = L[level].Ncols;
+  /*Work vectors live in scratch at offsets 0, Ncols and 2*Ncols*/
+  memory<dfloat> p  = scratch + 0*Ncols;
+  memory<dfloat> Ap = scratch + 1*Ncols;
+  memory<dfloat> z  = scratch + 2*Ncols;
+
+  // register scalars
+  dfloat rdotz1 = 0.0;
+  dfloat rdotz2 = 0.0;
+  dfloat alpha = 0.0, beta = 0.0, pAp = 0.0;
+  dfloat rdotr = 1.0; // NOTE(review): still 1.0 at the first check, so the cg_iter==0 exit below never fires - confirm intent
+  const int MAXIT = 5000;
+
+  /* We assume that x is initialized to some guess and
+     r = b-A*x */
+
+  /*Compute x = A^{-1} b*/
+  int cg_iter;
+  for(cg_iter=0;cg_iter<MAXIT;++cg_iter){
+    // TOL is compared with sqrt of the Allreduce'd r.r, i.e. an absolute (not relative) residual measure
+    // Exit if tolerance is reached, taking at least one step.
+    if (((cg_iter == 0) && (rdotr == 0.0)) ||
+        ((cg_iter > 0) && (sqrt(rdotr) <= TOL))) {
+      break;
+    }
+
+    // z = Precon^{-1} r
+    MultigridVcycle(level, r, z);
+
+    // r.z
+    rdotz2 = rdotz1;
+    rdotz1 = 0.0;
+    for (dlong n=0;n<N;++n) {
+      rdotz1 += z[n]*r[n];
+    }
+    comm.Allreduce(rdotz1);
+
+    beta = (cg_iter==0) ? 0.0 : rdotz1/rdotz2;
+
+    // p = z + beta*p
+    if (cg_iter==0) {
+      #pragma omp parallel for
+      for (dlong n=0;n<N;++n) {
+        p[n] = z[n];
+      }
+    } else {
+      #pragma omp parallel for
+      for (dlong n=0;n<N;++n) {
+        p[n] = z[n] + beta*p[n];
+      }
+    }
+
+    // A*p
+    A.SpMV(1.0, p, 0.0, Ap);
+
+    // p.Ap
+    pAp = 0.0;
+    for (dlong n=0;n<N;++n) {
+      pAp += p[n]*Ap[n];
+    }
+    comm.Allreduce(pAp);
+    // NOTE(review): pAp is not checked for zero before this division
+    alpha = rdotz1/pAp;
+
+    //  x <= x + alpha*p
+    //  r <= r - alpha*A*p
+    //  dot(r,r)
+    rdotr = 0.0;
+    for (dlong n=0;n<N;++n) {
+      x[n] = x[n] + alpha*p[n];
+      r[n] = r[n] - alpha*Ap[n];
+      rdotr += r[n]*r[n];
+    }
+    comm.Allreduce(rdotr);
+
+    if(rdotr<0) printf("WARNING CG: rdotr = %17.15lf\n", rdotr);
+
+    // printf("CG: it %d, r norm %12.12le, alpha = %le \n", cg_iter+1, sqrt(rdotr), alpha);
+  }
+  // Equals MAXIT when the loop ran out of iterations without meeting TOL
+  return cg_iter;
+}
+
+} //namespace paradogs
+
+} //namespace libp
diff --git a/libs/parAdogs/parAdogsSpectralBipartition.cpp b/libs/parAdogs/parAdogsSpectralBipartition.cpp
new file mode 100644
index 000000000..6abaaafff
--- /dev/null
+++ b/libs/parAdogs/parAdogsSpectralBipartition.cpp
@@ -0,0 +1,73 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include "parAdogs/parAdogsPartition.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+/****************************************/
+/* Multilevel Spectral Bipartition      */
+/****************************************/
+void graph_t::SpectralBipartition(const dfloat targetFraction[2]) {
+  /*Only targetFraction[0] is read here: it is the target share of the
+    global vertices for part 0*/
+
+  /*Create multilevel hierarchy*/
+  MultigridSetup();
+
+  /*Compute Fiedler vector */
+  memory<dfloat>& Fiedler = FiedlerVector();
+
+  /*Pick a threshold such that, globally, K entries of the Fiedler
+    vector are at or below it*/
+  const hlong K = std::ceil(targetFraction[0]*NVertsGlobal);
+  const dfloat threshold = ParallelPivot(Nverts, Fiedler, K, comm);
+
+  /*Label each local vertex: 0 at or below the threshold, 1 above it.
+    The array is sized L[0].A.Ncols, presumably local plus halo entries*/
+  memory<int> side(L[0].A.Ncols);
+  for (dlong v=0;v<Nverts;++v) {
+    side[v] = (Fiedler[v]<=threshold) ? 0 : 1;
+  }
+
+  /*Fill halo region of the label vector*/
+  L[0].A.halo.Exchange(side, 1);
+
+  /*Split the graph according to this partitioning*/
+  Split(side);
+
+  /*Clear the coarse levels*/
+  MultigridDestroy();
+}
+
+} //namespace paradogs
+
+} //namespace libp
+
diff --git a/libs/parAdogs/parAdogsSpectralPartition.cpp b/libs/parAdogs/parAdogsSpectralPartition.cpp
new file mode 100644
index 000000000..065c1e2fc
--- /dev/null
+++ b/libs/parAdogs/parAdogsSpectralPartition.cpp
@@ -0,0 +1,61 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAdogs.hpp"
+#include "parAdogs/parAdogsGraph.hpp"
+#include "parAdogs/parAdogsPartition.hpp"
+
+namespace libp {
+
+namespace paradogs {
+
+/*************************************************/
+/* k-Way Recursive Spectral Partitioning         */
+/*************************************************/
+void graph_t::SpectralPartition() {
+
+  /*Iterative form of the k-way recursion: keep bipartitioning until
+    size is 1. size is re-read on every pass, since the bipartition
+    step is expected to update it*/
+  while (size!=1) {
+
+    /*Left part takes the extra member when size is odd*/
+    const int leftSize = (size+1)/2;
+
+    /*Target share of the vertices for each of the two parts*/
+    const dfloat leftFraction = static_cast<dfloat>(leftSize)/size;
+    const dfloat rightFraction = 1.0 - leftFraction;
+    const dfloat bipartitionFraction[2] = {leftFraction, rightFraction};
+
+    /*Bipartition and redistribute, update size*/
+    SpectralBipartition(bipartitionFraction);
+  }
+}
+
+} //namespace paradogs
+
+} //namespace libp
+
diff --git a/libs/parAlmond/okl/SmoothChebyshev.okl b/libs/parAlmond/okl/SmoothChebyshev.okl
index 583e062a0..d511ff6cd 100644
--- a/libs/parAlmond/okl/SmoothChebyshev.okl
+++ b/libs/parAlmond/okl/SmoothChebyshev.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -164,4 +164,4 @@ SOFTWARE.
 
     x[n] += d_kp1;
   }
-}
\ No newline at end of file
+}
diff --git a/libs/parAlmond/okl/SmoothJacobi.okl b/libs/parAlmond/okl/SmoothJacobi.okl
index b3c0c21d5..5f3aa6107 100644
--- a/libs/parAlmond/okl/SmoothJacobi.okl
+++ b/libs/parAlmond/okl/SmoothJacobi.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -109,4 +109,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/libs/parAlmond/okl/SpMVcsr.okl b/libs/parAlmond/okl/SpMVcsr.okl
index 17ef01945..622347ffe 100644
--- a/libs/parAlmond/okl/SpMVcsr.okl
+++ b/libs/parAlmond/okl/SpMVcsr.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/parAlmond/okl/SpMVmcsr.okl b/libs/parAlmond/okl/SpMVmcsr.okl
index 1dff7dd36..b6ec7e84a 100644
--- a/libs/parAlmond/okl/SpMVmcsr.okl
+++ b/libs/parAlmond/okl/SpMVmcsr.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/parAlmond/okl/dGEMV.okl b/libs/parAlmond/okl/dGEMV.okl
index 971fd39b5..66a54c581 100644
--- a/libs/parAlmond/okl/dGEMV.okl
+++ b/libs/parAlmond/okl/dGEMV.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/parAlmond/okl/kcycleCombinedOp.okl b/libs/parAlmond/okl/kcycleCombinedOp.okl
index 1ad1dd415..fddef262b 100644
--- a/libs/parAlmond/okl/kcycleCombinedOp.okl
+++ b/libs/parAlmond/okl/kcycleCombinedOp.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -35,7 +35,7 @@ SOFTWARE.
 
   for(dlong n=0;n<Nblocks;++n;@outer(0)){
 
-    @shared volatile dfloat s_ip[3*p_BLOCKSIZE];
+    @shared dfloat s_ip[3*p_BLOCKSIZE];
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)){
       dlong id = t + n*p_BLOCKSIZE;
@@ -53,7 +53,6 @@ SOFTWARE.
         id += p_BLOCKSIZE*Nblocks;
       }
     }
-    @barrier("local");
 
 #if p_BLOCKSIZE>512
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
@@ -63,7 +62,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+512];
       }
     }
-    @barrier("local");
 #endif
 
 #if p_BLOCKSIZE>256
@@ -74,7 +72,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256];
       }
     }
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
@@ -84,7 +81,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+128];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<64) {
@@ -93,7 +89,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+64];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<32) {
@@ -102,7 +97,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+32];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<16) {
@@ -111,7 +105,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+16];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<8) {
@@ -120,7 +113,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+8];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<4) {
@@ -129,7 +121,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+4];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<2) {
@@ -138,7 +129,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+2];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<1) {
@@ -180,7 +170,6 @@ SOFTWARE.
         id += p_BLOCKSIZE*Nblocks;
       }
     }
-    @barrier("local");
 
     #if p_BLOCKSIZE>512
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
@@ -190,7 +179,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+512];
       }
     }
-    @barrier("local");
 #endif
 
 #if p_BLOCKSIZE>256
@@ -201,7 +189,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256];
       }
     }
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
@@ -211,7 +198,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+128];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<64) {
@@ -220,7 +206,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+64];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<32) {
@@ -229,7 +214,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+32];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<16) {
@@ -238,7 +222,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+16];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<8) {
@@ -247,7 +230,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+8];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<4) {
@@ -256,7 +238,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+4];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<2) {
@@ -265,7 +246,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+2];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<1) {
@@ -308,7 +288,6 @@ SOFTWARE.
         id += p_BLOCKSIZE*Nblocks;
       }
     }
-    @barrier("local");
 
 #if p_BLOCKSIZE>512
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
@@ -318,7 +297,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+512];
       }
     }
-    @barrier("local");
 #endif
 
 #if p_BLOCKSIZE>256
@@ -329,7 +307,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256];
       }
     }
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
@@ -339,7 +316,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+128];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<64) {
@@ -348,7 +324,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+64];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<32) {
@@ -357,7 +332,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+32];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<16) {
@@ -366,7 +340,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+16];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<8) {
@@ -375,7 +348,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+8];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<4) {
@@ -384,7 +356,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+4];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<2) {
@@ -393,7 +364,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+2];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<1) {
@@ -437,7 +407,6 @@ SOFTWARE.
         id += p_BLOCKSIZE*Nblocks;
       }
     }
-    @barrier("local");
 
     #if p_BLOCKSIZE>512
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
@@ -447,7 +416,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+512];
       }
     }
-    @barrier("local");
 #endif
 
 #if p_BLOCKSIZE>256
@@ -458,7 +426,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256];
       }
     }
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
@@ -468,7 +435,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+128];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<64) {
@@ -477,7 +443,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+64];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<32) {
@@ -486,7 +451,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+32];
       }
     }
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<16) {
@@ -495,7 +459,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+16];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<8) {
@@ -504,7 +467,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+8];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<4) {
@@ -513,7 +475,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+4];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<2) {
@@ -522,7 +483,6 @@ SOFTWARE.
         s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+2];
       }
     }
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
       if(t<1) {
diff --git a/libs/parAlmond/okl/vectorAddInnerProd.okl b/libs/parAlmond/okl/vectorAddInnerProd.okl
index 4ae7b72e3..eb3e20d65 100644
--- a/libs/parAlmond/okl/vectorAddInnerProd.okl
+++ b/libs/parAlmond/okl/vectorAddInnerProd.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -37,7 +37,7 @@ SOFTWARE.
 
   for(dlong b=0;b<Nblocks;++b;@outer(0)){
 
-    @shared volatile dfloat s_ip[p_BLOCKSIZE];
+    @shared dfloat s_ip[p_BLOCKSIZE];
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)){
       dlong id = t + b*p_BLOCKSIZE;
@@ -56,38 +56,28 @@ SOFTWARE.
         id += p_BLOCKSIZE*Nblocks;
       }
     }
-    @barrier("local");
 
 #if p_BLOCKSIZE>512
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<512) s_ip[t] += s_ip[t+512];
-    @barrier("local");
 #endif
 
 #if p_BLOCKSIZE>256
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<256) s_ip[t] += s_ip[t+256];
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<128) s_ip[t] += s_ip[t+128];
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 64) s_ip[t] += s_ip[t+ 64];
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 32) s_ip[t] += s_ip[t+ 32];
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 16) s_ip[t] += s_ip[t+ 16];
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  8) s_ip[t] += s_ip[t+  8];
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  4) s_ip[t] += s_ip[t+  4];
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  2) s_ip[t] += s_ip[t+  2];
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  1) ip[b] = s_ip[0] + s_ip[1];
   }
@@ -125,38 +115,28 @@ SOFTWARE.
         id += p_BLOCKSIZE*Nblocks;
       }
     }
-    @barrier("local");
 
 #if p_BLOCKSIZE>512
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<512) s_ip[t] += s_ip[t+512];
-    @barrier("local");
 #endif
 
 #if p_BLOCKSIZE>256
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<256) s_ip[t] += s_ip[t+256];
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<128) s_ip[t] += s_ip[t+128];
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 64) s_ip[t] += s_ip[t+ 64];
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 32) s_ip[t] += s_ip[t+ 32];
-    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 16) s_ip[t] += s_ip[t+ 16];
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  8) s_ip[t] += s_ip[t+  8];
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  4) s_ip[t] += s_ip[t+  4];
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  2) s_ip[t] += s_ip[t+  2];
-    //    @barrier("local");
 
     for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  1) ip[b] = s_ip[0] + s_ip[1];
   }
diff --git a/libs/parAlmond/parAlmond.cpp b/libs/parAlmond/parAlmond.cpp
index 7c5d57bcf..3c0b7f9fd 100644
--- a/libs/parAlmond/parAlmond.cpp
+++ b/libs/parAlmond/parAlmond.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,51 +25,46 @@ SOFTWARE.
 */
 
 #include "parAlmond.hpp"
-#include "parAlmond/parAlmondMultigrid.hpp"
 #include "parAlmond/parAlmondKernels.hpp"
+#include "parAlmond/parAlmondCoarseSolver.hpp"
+
+namespace libp {
 
 namespace parAlmond {
 
-parAlmond_t::parAlmond_t(platform_t& _platform, settings_t& _settings, MPI_Comm comm):
-  platform(_platform), settings(_settings) {
+void parAlmond_t::Setup(platform_t& _platform, settings_t& _settings, comm_t comm) {
+
+  platform = _platform;
+  settings = _settings;
 
-  platform.linAlg.InitKernels({"set", "add", "sum", "scale",
+  platform.linAlg().InitKernels({"set", "add", "sum", "scale",
                                 "axpy", "zaxpy",
                                 "amx", "amxpy", "zamxpy",
                                 "adx", "adxpy", "zadxpy",
                                 "innerProd", "norm2"});
 
-  multigrid = new multigrid_t(platform, settings, comm);
+  multigrid = std::make_shared<multigrid_t>(platform, settings, comm);
 
   //build parAlmond kernels on first construction
-  if (Nrefs==0) buildParAlmondKernels(platform);
-  Nrefs++;
+  buildParAlmondKernels(platform);
 }
 
-void parAlmond_t::Operator(occa::memory& o_rhs, occa::memory& o_x) {
+void parAlmond_t::Operator(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_x) {
 
   if (multigrid->exact){ //call the linear solver
     int maxIter = 500;
     int verbose = settings.compareSetting("VERBOSE", "TRUE") ? 1 : 0;
     dfloat tol = 1e-8;
-    solver_t &A = *(multigrid->levels[0]);
-    (void) multigrid->linearSolver->Solve(A, *multigrid, o_x, o_rhs, tol, maxIter, verbose);
+    solver_t &A = multigrid->GetLevel<solver_t>(0);
+    (void) multigrid->linearSolver.Solve(A, *multigrid, o_x, o_rhs, tol, maxIter, verbose);
   } else { //apply a multigrid cycle
     multigrid->Operator(o_rhs, o_x);
   }
 }
 
-//Add level to multigrid heirarchy
-void parAlmond_t::AddLevel(multigridLevel* level) {
-  multigrid->AddLevel(level);
-}
-
 void parAlmond_t::Report() {
 
-  int rank;
-  MPI_Comm_rank(multigrid->comm, &rank);
-
-  if(rank==0) {
+  if(multigrid->comm.rank()==0) {
     printf("-----------------------------Multigrid Report-----------------------------------------------\n");
     printf("--------------------------------------------------------------------------------------------\n");
     printf("Level |    Type    |    Dimension   |  Per Rank Dim  |   nnz per row   |   Smoother        |\n");
@@ -78,17 +73,21 @@ void parAlmond_t::Report() {
   }
 
   for(int lev=0; lev<multigrid->numLevels-1; lev++) {
-    if(rank==0) {printf(" %3d  ", lev);fflush(stdout);}
+    if(multigrid->comm.rank()==0) {printf(" %3d  ", lev);fflush(stdout);}
     multigrid->levels[lev]->Report();
   }
 
   //base level
   multigrid->coarseSolver->Report(multigrid->numLevels-1);
 
-  if(rank==0)
+  if(multigrid->comm.rank()==0)
     printf("--------------------------------------------------------------------------------------------\n");
 }
 
+int parAlmond_t::NumLevels() { // accessor for the level count stored on the multigrid object
+  return multigrid->numLevels;
+}
+
 dlong parAlmond_t::getNumCols(int k) {
   return multigrid->levels[k]->Ncols;
 }
@@ -97,11 +96,6 @@ dlong parAlmond_t::getNumRows(int k) {
   return multigrid->levels[k]->Nrows;
 }
 
-parAlmond_t::~parAlmond_t() {
-  Nrefs--;
-  if (Nrefs==0) freeParAlmondKernels();
-
-  delete multigrid;
-}
-
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondAMGLevel.cpp b/libs/parAlmond/parAlmondAMGLevel.cpp
index 412d99cb7..ee8eb17fe 100644
--- a/libs/parAlmond/parAlmondAMGLevel.cpp
+++ b/libs/parAlmond/parAlmondAMGLevel.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,14 +25,16 @@ SOFTWARE.
 */
 
 #include "parAlmond.hpp"
-#include "parAlmond/parAlmondMultigrid.hpp"
 #include "parAlmond/parAlmondAMGLevel.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-amgLevel::amgLevel(parCSR *_A, settings_t& _settings):
-  multigridLevel(_A->Nrows, _A->Ncols, _A->platform, _settings),
-  A(_A) {
+amgLevel::amgLevel(parCSR& _A, settings_t& _settings):
+  multigridLevel(_A.Nrows, _A.Ncols, _A.platform, _settings, _A.comm) {
+
+  A = _A;
 
   //determine smoother
   if (settings.compareSetting("PARALMOND SMOOTHER", "CHEBYSHEV")) {
@@ -43,35 +45,29 @@ amgLevel::amgLevel(parCSR *_A, settings_t& _settings):
   }
 }
 
-amgLevel::~amgLevel() {
-  if (  A) delete   A;
-  if (  P) delete   P;
-  if (  R) delete   R;
+void amgLevel::Operator(deviceMemory<dfloat>& o_X, deviceMemory<dfloat>& o_Ax){ // apply this level's matrix A to o_X, result in o_Ax
+  A.SpMV(1.0, o_X, 0.0, o_Ax); // presumably o_Ax = 1.0*A*o_X + 0.0*o_Ax - confirm against parCSR::SpMV
 }
 
-void amgLevel::Operator(occa::memory& o_X, occa::memory& o_Ax){
-  A->SpMV(1.0, o_X, 0.0, o_Ax);
+void amgLevel::coarsen   (deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Rr){ // apply the matrix R to o_r, result in o_Rr
+  R.SpMV(1.0, o_r, 0.0, o_Rr); // presumably o_Rr = R*o_r (beta is 0.0) - confirm against parCSR::SpMV
 }
 
-void amgLevel::coarsen   (occa::memory& o_r, occa::memory& o_Rr){
-  R->SpMV(1.0, o_r, 0.0, o_Rr);
+void amgLevel::prolongate(deviceMemory<dfloat>& o_X, deviceMemory<dfloat>& o_Px){ // apply the matrix P to o_X and combine with the existing o_Px
+  P.SpMV(1.0, o_X, 1.0, o_Px); // beta is 1.0 here, so presumably o_Px += P*o_X (accumulates rather than overwrites) - confirm
 }
 
-void amgLevel::prolongate(occa::memory& o_X, occa::memory& o_Px){
-  P->SpMV(1.0, o_X, 1.0, o_Px);
+void amgLevel::residual  (deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_X, // form the residual of A for the iterate o_X
+                          deviceMemory<dfloat>& o_RES) { // o_RES receives the result
+  A.SpMV(-1.0, o_X, 1.0, o_RHS, o_RES); // five-argument SpMV: presumably o_RES = 1.0*o_RHS - 1.0*A*o_X - confirm
 }
 
-void amgLevel::residual  (occa::memory& o_RHS, occa::memory& o_X,
-                          occa::memory& o_RES) {
-  A->SpMV(-1.0, o_X, 1.0, o_RHS, o_RES);
-}
-
-void amgLevel::smooth(occa::memory& o_RHS, occa::memory& o_X, bool x_is_zero){
+void amgLevel::smooth(deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_X, bool x_is_zero){
   if(stype == DAMPED_JACOBI){
-    A->smoothDampedJacobi(o_RHS, o_X, lambda,
+    A.smoothDampedJacobi(o_RHS, o_X, lambda,
                           x_is_zero, o_scratch);
   } else if(stype == CHEBYSHEV){
-    A->smoothChebyshev(o_RHS, o_X, lambda0, lambda1,
+    A.smoothChebyshev(o_RHS, o_X, lambda0, lambda1,
                        x_is_zero, o_scratch,
                        ChebyshevIterations);
   }
@@ -80,61 +76,51 @@ void amgLevel::smooth(occa::memory& o_RHS, occa::memory& o_X, bool x_is_zero){
 void amgLevel::setupSmoother(){
 
   if (stype == DAMPED_JACOBI) {
-    lambda = (4./3.)/A->rho;
+    lambda = (4./3.)/A.rho; // weight later passed to A.smoothDampedJacobi; A.rho is presumably a spectral-radius estimate - confirm
   } else if (stype == CHEBYSHEV) {
-    lambda1 = A->rho;
-    lambda0 = A->rho/10.;
+    lambda1 = A.rho; // larger of the two bounds later passed to A.smoothChebyshev
+    lambda0 = A.rho/10.; // smaller bound, fixed at one tenth of lambda1
   }
 }
 
 void amgLevel::syncToDevice(){
-  A->syncToDevice();
-  if (P) P->syncToDevice();
-  if (R) R->syncToDevice();
+  if (A.Nrows>0) A.syncToDevice(); // NOTE(review): A was synced unconditionally before; now skipped when it has no local rows
+  if (P.Nrows>0) P.syncToDevice(); // Nrows>0 stands in for the old null-pointer test now that P is a member object
+  if (R.Nrows>0) R.syncToDevice(); // same test for R
 }
 
 void amgLevel::Report() {
 
   //This setup can be called by many subcommunicators, so only
   // print on the global root.
-  int rank;
-  MPI_Comm_rank(A->comm, &rank);
-
-  hlong hNrows = (hlong) Nrows;
+  int totalActive=(Nrows>0) ? 1:0;
+  A.comm.Allreduce(totalActive);
 
-  int active = (Nrows>0) ? 1:0;
-  int totalActive=0;
-  MPI_Allreduce(&active, &totalActive, 1, MPI_INT, MPI_SUM, A->comm);
+  dlong minNrows=Nrows, maxNrows=Nrows;
+  hlong totalNrows=Nrows;
+  A.comm.Allreduce(maxNrows, Comm::Max);
+  A.comm.Allreduce(totalNrows, Comm::Sum);
+  dfloat avgNrows = (dfloat) totalNrows/totalActive;
 
-  dlong minNrows=0, maxNrows=0;
-  hlong totalNrows=0;
-  dfloat avgNrows;
-  MPI_Allreduce(&Nrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, A->comm);
-  MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, A->comm);
-  avgNrows = (dfloat) totalNrows/totalActive;
+  if (Nrows==0) minNrows=maxNrows; //set this so it's ignored for the global min
+  A.comm.Allreduce(minNrows, Comm::Min);
 
-  if (Nrows==0) Nrows=maxNrows; //set this so it's ignored for the global min
-  MPI_Allreduce(&Nrows, &minNrows, 1, MPI_DLONG, MPI_MIN, A->comm);
+  long long int nnz = A.diag.nnz+A.offd.nnz;
+  long long int minNnz=nnz, maxNnz=nnz, totalNnz=nnz;
+  A.comm.Allreduce(maxNnz, Comm::Max);
+  A.comm.Allreduce(totalNnz, Comm::Sum);
 
-
-  long long int nnz;
-  nnz = A->diag.nnz+A->offd.nnz;
-
-  long long int minNnz=0, maxNnz=0, totalNnz=0;
-  MPI_Allreduce(&nnz, &maxNnz,   1, MPI_LONG_LONG_INT, MPI_MAX, A->comm);
-  MPI_Allreduce(&nnz, &totalNnz, 1, MPI_LONG_LONG_INT, MPI_SUM, A->comm);
-
-  if (nnz==0) nnz = maxNnz; //set this so it's ignored for the global min
-  MPI_Allreduce(&nnz, &minNnz, 1, MPI_LONG_LONG_INT, MPI_MIN, A->comm);
+  if (nnz==0) minNnz = maxNnz; //set this so it's ignored for the global min
+  A.comm.Allreduce(minNnz, Comm::Min);
 
   dfloat nnzPerRow = (Nrows==0) ? 0 : (dfloat) nnz/Nrows;
-  dfloat minNnzPerRow=0, maxNnzPerRow=0, avgNnzPerRow=0;
-  MPI_Allreduce(&nnzPerRow, &maxNnzPerRow, 1, MPI_DFLOAT, MPI_MAX, A->comm);
-  MPI_Allreduce(&nnzPerRow, &avgNnzPerRow, 1, MPI_DFLOAT, MPI_SUM, A->comm);
+  dfloat minNnzPerRow=nnzPerRow, maxNnzPerRow=nnzPerRow, avgNnzPerRow=nnzPerRow;
+  A.comm.Allreduce(maxNnzPerRow, Comm::Max);
+  A.comm.Allreduce(avgNnzPerRow, Comm::Sum);
   avgNnzPerRow /= totalActive;
 
-  if (Nrows==0) nnzPerRow = maxNnzPerRow;
-  MPI_Allreduce(&nnzPerRow, &minNnzPerRow, 1, MPI_DFLOAT, MPI_MIN, A->comm);
+  if (Nrows==0) minNnzPerRow = maxNnzPerRow;
+  A.comm.Allreduce(minNnzPerRow, Comm::Min);
 
   char smootherString[BUFSIZ];
   if (stype==DAMPED_JACOBI)
@@ -142,11 +128,13 @@ void amgLevel::Report() {
   else if (stype==CHEBYSHEV)
     strcpy(smootherString, "Chebyshev       ");
 
-  if (rank==0){
+  if (comm.rank()==0){
     printf(      "|  parAlmond |  %12lld  |  %12d  | %13d   |   %s|\n", (long long int) totalNrows, minNrows, (int)minNnzPerRow, smootherString);
     printf("      |            |                |  %12d  | %13d   |                   |\n", maxNrows, (int)maxNnzPerRow);
     printf("      |            |                |  %12d  | %13d   |                   |\n", (int)avgNrows, (int)avgNnzPerRow);
   }
 }
 
-} //namespace parAlmond
\ No newline at end of file
+} //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondAMGSetup.cpp b/libs/parAlmond/parAlmondAMGSetup.cpp
index a3091d2dc..ab4287764 100644
--- a/libs/parAlmond/parAlmondAMGSetup.cpp
+++ b/libs/parAlmond/parAlmondAMGSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,58 +26,65 @@ SOFTWARE.
 
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondAMGSetup.hpp"
+#include "parAlmond/parAlmondCoarseSolver.hpp"
+
+namespace libp {
 
 namespace parAlmond {
 
 void parAlmond_t::AMGSetup(parCOO& cooA,
                          bool nullSpace,
-                         dfloat *nullVector,
+                         memory<dfloat> nullVector,
                          dfloat nullSpacePenalty){
 
-  int rank;
-  int size;
-  MPI_Comm_rank(cooA.comm, &rank);
-  MPI_Comm_size(cooA.comm, &size);
+  int rank = cooA.comm.rank();
+  int size = cooA.comm.size();
+
+  if(Comm::World().rank()==0) {printf("Setting up AMG...");fflush(stdout);}
+
+  /*Get multigrid solver*/
+  multigrid_t& mg = *multigrid;
 
-  if(rank==0) {printf("Setting up AMG...");fflush(stdout);}
+  /*Get coarse solver*/
+  coarseSolver_t& coarse = *(mg.coarseSolver);
 
   //make csr matrix from coo input
-  parCSR *A = new parCSR(cooA);
-  A->diagSetup();
+  parCSR A(cooA);
+  A.diagSetup();
 
   //copy fine nullvector
-  dfloat *null = (dfloat *) malloc(A->Nrows*sizeof(dfloat));
-  memcpy(null, nullVector, A->Nrows*sizeof(dfloat));
+  memory<dfloat> null(A.Nrows);
+  null.copyFrom(nullVector, A.Nrows);
 
   // find target N at coarsest level
-  const int gCoarseSize = multigrid->coarseSolver->getTargetSize();
-
-  amgLevel *L = new amgLevel(A, settings);
+  const int gCoarseSize = coarse.getTargetSize();
 
   hlong globalSize;
-  if (multigrid->coarsetype==COARSEEXACT) {
-    globalSize = L->A->globalRowStarts[size];
+  if (mg.coarsetype==COARSEEXACT) {
+    globalSize = A.globalRowStarts[size];
   } else { //COARSEOAS
     //OAS cares about Ncols for size
-    hlong localSize = A->Ncols;
-    MPI_Allreduce(&localSize,&globalSize,1,MPI_HLONG,MPI_SUM,A->comm);
+    globalSize = A.Ncols;
+    A.comm.Allreduce(globalSize);
   }
 
+  amgLevel& Lbase = mg.AddLevel<amgLevel>(A, settings);
+
   //if the system if already small, dont create MG levels
   bool done = false;
   if(globalSize <= gCoarseSize){
-    multigrid->AddLevel(L);
-    multigrid->coarseSolver->setup(A, nullSpace, null, nullSpacePenalty);
-    multigrid->coarseSolver->syncToDevice();
-    multigrid->baseLevel = multigrid->numLevels-1;
-    L->syncToDevice();
+    mg.AllocateLevelWorkSpace(mg.numLevels-1);
+    coarse.setup(A, nullSpace, null, nullSpacePenalty);
+    coarse.syncToDevice();
+    mg.baseLevel = mg.numLevels-1;
+    Lbase.syncToDevice();
     done = true;
   }
 
   //TODO: make the coarsen threasholds user-provided inputs
-  // For now, let default to some sensible threasholds
+  // For now, default to some sensible thresholds
   dfloat theta=0.0;
-  if (multigrid->strtype==RUGESTUBEN) {
+  if (mg.strtype==RUGESTUBEN) {
     theta=0.5; //default for 3D problems
     //See: A GPU accelerated aggregation algebraic multigrid method, R. Gandham, K. Esler, Y. Zhang.
   } else { // (type==SYMMETRIC)
@@ -86,49 +93,56 @@ void parAlmond_t::AMGSetup(parCOO& cooA,
   }
 
   while(!done){
-    L->setupSmoother();
+    /*Get current coarsest level*/
+    amgLevel& L = mg.GetLevel<amgLevel>(mg.numLevels-1);
+
+    /*Build smoother*/
+    L.setupSmoother();
 
-    // Create coarse level via AMG. Coarsen null vector
-    amgLevel* Lcoarse = coarsenAmgLevel(L, null,
-                                        multigrid->strtype, theta,
-                                        multigrid->aggtype);
-    multigrid->AddLevel(L);
-    L->syncToDevice();
+    /*Create new level*/
+    amgLevel& Lcoarse = mg.AddLevel<amgLevel>();
+
+    /* Coarsen level via AMG. Coarsen null vector */
+    Lcoarse = coarsenAmgLevel(L, null,
+                              mg.strtype, theta,
+                              mg.aggtype);
+
+    mg.AllocateLevelWorkSpace(mg.numLevels-2);
+    L.syncToDevice();
+
+    parCSR& Acoarse = Lcoarse.A;
 
     // Increase coarsening rate as we add levels.
     //See: Algebraic Multigrid On Unstructured Meshes, P Vanek, J. Mandel, M. Brezina.
-    if (multigrid->strtype==SYMMETRIC)
+    if (mg.strtype==SYMMETRIC)
       theta=theta/2;
 
     hlong globalCoarseSize;
-    if (multigrid->coarsetype==COARSEEXACT) {
-      globalCoarseSize = Lcoarse->A->globalRowStarts[size];;
+    if (mg.coarsetype==COARSEEXACT) {
+      globalCoarseSize = Acoarse.globalRowStarts[size];;
     } else { //COARSEOAS
       //OAS cares about Ncols for size
-      hlong localSize = Lcoarse->A->Ncols;
-      MPI_Allreduce(&localSize,&globalCoarseSize,1,MPI_HLONG,MPI_SUM,Lcoarse->A->comm);
+      globalCoarseSize = Acoarse.Ncols;
+      Acoarse.comm.Allreduce(globalCoarseSize);
     }
 
     if(globalCoarseSize <= gCoarseSize || globalSize < 2*globalCoarseSize){
-      if (globalSize < 2*globalCoarseSize && rank==0) {
-        stringstream ss;
-        ss << "AMG coarsening stalling, attemping coarse solver setup with dimension N=" << globalCoarseSize;
-        LIBP_WARNING(ss.str());
-      }
-      multigrid->AddLevel(Lcoarse);
-      Lcoarse->syncToDevice();
-      multigrid->coarseSolver->setup(Lcoarse->A, nullSpace, null, nullSpacePenalty);
-      multigrid->coarseSolver->syncToDevice();
-      multigrid->baseLevel = multigrid->numLevels-1;
+      LIBP_WARNING("AMG coarsening stalling, attemping coarse solver setup with dimension N=" << globalCoarseSize,
+                   globalSize < 2*globalCoarseSize && rank==0);
+
+      mg.AllocateLevelWorkSpace(mg.numLevels-1);
+      Lcoarse.syncToDevice();
+      coarse.setup(Acoarse, nullSpace, null, nullSpacePenalty);
+      coarse.syncToDevice();
+      mg.baseLevel = mg.numLevels-1;
       break;
     }
     globalSize = globalCoarseSize;
-    L = Lcoarse;
   }
 
-  free(null);
-
-  if(rank==0) printf("done.\n");
+  if(Comm::World().rank()==0) printf("done.\n");
 }
 
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondAMGSmoother.cpp b/libs/parAlmond/parAlmondAMGSmoother.cpp
index 360799210..493f8a11c 100644
--- a/libs/parAlmond/parAlmondAMGSmoother.cpp
+++ b/libs/parAlmond/parAlmondAMGSmoother.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,48 +25,49 @@ SOFTWARE.
 */
 
 #include "parAlmond.hpp"
-#include "parAlmond/parAlmondMultigrid.hpp"
 #include "parAlmond/parAlmondAMGLevel.hpp"
 #include "parAlmond/parAlmondKernels.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-void parCSR::smoothDampedJacobi(occa::memory& o_r, occa::memory& o_x,
+void parCSR::smoothDampedJacobi(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_x, // one damped-Jacobi sweep: o_r is the right-hand side, o_x is updated in place
                                 const dfloat lambda, bool x_is_zero,
-                                occa::memory& o_scratch){
+                                deviceMemory<dfloat>& o_scratch){ // o_scratch is used as storage for the update d
 
   if(x_is_zero){
     // x = lambda*inv(D)*r
-    platform.linAlg.amxpy(Nrows, lambda, o_diagInv, o_r, 0.0, o_x);
+    platform.linAlg().amxpy(Nrows, lambda, o_diagInv, o_r, 0.0, o_x); // shortcut: no matrix product or halo exchange needed when x is zero
     return;
   }
 
-  occa::memory o_d = o_scratch;
+  deviceMemory<dfloat> o_d = o_scratch; // presumably a handle copy, so d lives in the caller's scratch buffer - confirm
 
-  halo->ExchangeStart(o_x, 1, ogs_dfloat);
+  halo.ExchangeStart(o_x, 1); // begin the halo exchange of x; it is finished only after the diag kernel has been queued
 
   // d = lambda*inv(D)*(r-A*x)
   if (diag.NrowBlocks)
     SmoothJacobiCSRKernel(diag.NrowBlocks,
-                   diag.o_blockRowStarts, diag.o_rowStarts,
-                   diag.o_cols, diag.o_vals,
-                   lambda, o_diagInv,
-                   o_r, o_x, o_d);
+                         diag.o_blockRowStarts, diag.o_rowStarts,
+                         diag.o_cols, diag.o_vals,
+                         lambda, o_diagInv,
+                         o_r, o_x, o_d);
 
-  halo->ExchangeFinish(o_x, 1, ogs_dfloat);
+  halo.ExchangeFinish(o_x, 1); // complete the exchange before the offd kernel below is queued
 
   if (offd.NrowBlocks)
     SmoothJacobiMCSRKernel(offd.NrowBlocks,
-                   offd.o_blockRowStarts, offd.o_mRowStarts,
-                   offd.o_rows, offd.o_cols, offd.o_vals,
-                   lambda, o_diagInv, o_x, o_d);
+                           offd.o_blockRowStarts, offd.o_mRowStarts,
+                           offd.o_rows, offd.o_cols, offd.o_vals,
+                           lambda, o_diagInv, o_x, o_d);
 
-  platform.linAlg.axpy(Nrows, 1.0, o_d, 1.0, o_x);
+  platform.linAlg().axpy(Nrows, 1.0, o_d, 1.0, o_x); // presumably x = 1.0*d + 1.0*x, i.e. x += d - confirm the axpy argument order
 }
 
-void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x,
+void parCSR::smoothChebyshev(deviceMemory<dfloat>& o_b, deviceMemory<dfloat>& o_x,
                              const dfloat lambda0, const dfloat lambda1,
-                             bool x_is_zero, occa::memory& o_scratch,
+                             bool x_is_zero, deviceMemory<dfloat>& o_scratch,
                              const int ChebyshevIterations) {
 
   const dfloat theta = 0.5*(lambda1+lambda0);
@@ -76,8 +77,8 @@ void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x,
   dfloat rho_n = 1./sigma;
   dfloat rho_np1;
 
-  occa::memory o_d = o_scratch + 0*Ncols*sizeof(dfloat);
-  occa::memory o_r = o_scratch + 1*Ncols*sizeof(dfloat);
+  deviceMemory<dfloat> o_d = o_scratch + 0*Ncols;
+  deviceMemory<dfloat> o_r = o_scratch + 1*Ncols;
 
 
   if(x_is_zero){ //skip the Ax if x is zero
@@ -89,25 +90,25 @@ void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x,
                                  o_b, o_r, o_d, o_x);
   } else {
     //r = D^{-1}(b-A*x)
-    halo->ExchangeStart(o_x, 1, ogs_dfloat);
+    halo.ExchangeStart(o_x, 1);
 
     const dfloat alpha = 0.0;
     const dfloat beta = 1.0;
 
     if (diag.NrowBlocks)
       SmoothChebyshevCSRKernel(diag.NrowBlocks,
-                     diag.o_blockRowStarts, diag.o_rowStarts,
-                     diag.o_cols, diag.o_vals,
-                     alpha, beta, o_diagInv,
-                     o_b, o_x, o_r);
+                               diag.o_blockRowStarts, diag.o_rowStarts,
+                               diag.o_cols, diag.o_vals,
+                               alpha, beta, o_diagInv,
+                               o_b, o_x, o_r);
 
-    halo->ExchangeFinish(o_x, 1, ogs_dfloat);
+    halo.ExchangeFinish(o_x, 1);
 
     if (offd.NrowBlocks)
       SmoothChebyshevMCSRKernel(offd.NrowBlocks,
-                     offd.o_blockRowStarts, offd.o_mRowStarts,
-                     offd.o_rows, offd.o_cols, offd.o_vals,
-                     o_diagInv, o_x, o_r);
+                               offd.o_blockRowStarts, offd.o_mRowStarts,
+                               offd.o_rows, offd.o_cols, offd.o_vals,
+                               o_diagInv, o_x, o_r);
 
     const int last_it = (ChebyshevIterations==0) ? 1 : 0;
 
@@ -124,23 +125,22 @@ void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x,
     const dfloat beta = 0.0;
 
     //r_k+1 = r_k - D^{-1}Ad_k
-    halo->ExchangeStart(o_d, 1, ogs_dfloat);
+    halo.ExchangeStart(o_d, 1);
 
     if (diag.NrowBlocks)
       SmoothChebyshevCSRKernel(diag.NrowBlocks,
-                     diag.o_blockRowStarts, diag.o_rowStarts,
-                     diag.o_cols, diag.o_vals,
-                     alpha, beta, o_diagInv,
-                     o_b, o_d, o_r);
+                               diag.o_blockRowStarts, diag.o_rowStarts,
+                               diag.o_cols, diag.o_vals,
+                               alpha, beta, o_diagInv,
+                               o_b, o_d, o_r);
 
-    halo->ExchangeFinish(o_d, 1, ogs_dfloat);
+    halo.ExchangeFinish(o_d, 1);
 
     if (offd.NrowBlocks)
       SmoothChebyshevMCSRKernel(offd.NrowBlocks,
-                     offd.o_blockRowStarts, offd.o_mRowStarts,
-                     offd.o_rows, offd.o_cols, offd.o_vals,
-                     o_diagInv, o_d, o_r);
-
+                               offd.o_blockRowStarts, offd.o_mRowStarts,
+                               offd.o_rows, offd.o_cols, offd.o_vals,
+                               o_diagInv, o_d, o_r);
 
     const int last_it = (k==ChebyshevIterations-1) ? 1 : 0;
 
@@ -151,7 +151,7 @@ void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x,
     if (Nrows)
       SmoothChebyshevUpdateKernel(Nrows,
                                   rho_np1*rho_n,
-                                  2.0*rho_np1/delta,
+                                  dfloat(2.0)*rho_np1/delta,
                                   last_it,
                                   o_r, o_d, o_x);
 
@@ -160,3 +160,5 @@ void parCSR::smoothChebyshev(occa::memory& o_b, occa::memory& o_x,
 }
 
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondCoarseExact.cpp b/libs/parAlmond/parAlmondCoarseExact.cpp
index 74324682f..9792216f6 100644
--- a/libs/parAlmond/parAlmondCoarseExact.cpp
+++ b/libs/parAlmond/parAlmondCoarseExact.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,23 +28,20 @@ SOFTWARE.
 #include "parAlmond/parAlmondCoarseSolver.hpp"
 #include "parAlmond/parAlmondKernels.hpp"
 
-//link in the data stream from ogs
-namespace ogs {
-  extern occa::stream dataStream;
-}
+namespace libp {
 
 namespace parAlmond {
 
-void exactSolver_t::solve(occa::memory& o_rhs, occa::memory& o_x) {
+void exactSolver_t::solve(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_x) {
 
-  occa::stream currentStream = platform.device.getStream();
+  stream_t currentStream = platform.getStream();
 
   //queue transfering coarse vector to host for Allgather
   if(N) {
-    platform.device.finish();
-    platform.device.setStream(ogs::dataStream);
-    o_rhs.copyTo(diagRhs, N*sizeof(dfloat), 0, "async: true");
-    platform.device.setStream(currentStream);
+    platform.finish();
+    platform.setStream(ogs::ogsBase_t::dataStream);
+    o_rhs.copyTo(diagRhs, N, 0, properties_t("async", true));
+    platform.setStream(currentStream);
   }
 
   //queue local part of gemv
@@ -55,19 +52,19 @@ void exactSolver_t::solve(occa::memory& o_rhs, occa::memory& o_x) {
 
   if(offdTotal) {
     //wait for data to arrive on host
-    platform.device.setStream(ogs::dataStream);
-    platform.device.finish();
+    platform.setStream(ogs::ogsBase_t::dataStream);
+    platform.finish();
 
 
     //gather the offd rhs entries
-    MPI_Alltoallv(diagRhs,   sendCounts,   sendOffsets, MPI_DFLOAT,
-                  offdRhs, coarseCounts, coarseOffsets, MPI_DFLOAT, comm);
+    comm.Alltoallv(diagRhs,   sendCounts,   sendOffsets,
+                   offdRhs, coarseCounts, coarseOffsets);
 
     //queue transfering coarse vector to device
-    o_offdRhs.copyFrom(offdRhs, offdTotal*sizeof(dfloat), 0, "async: true");
-    platform.device.finish(); //wait for transfer to complete
+    o_offdRhs.copyFrom(offdRhs, offdTotal, 0, properties_t("async", true));
+    platform.finish(); //wait for transfer to complete
 
-    platform.device.setStream(currentStream);
+    platform.setStream(currentStream);
 
     //queue offd part of gemv
     if (N)
@@ -80,65 +77,66 @@ int exactSolver_t::getTargetSize() {
   return 1000;
 }
 
-void exactSolver_t::setup(parCSR *_A, bool nullSpace,
-                           dfloat *nullVector, dfloat nullSpacePenalty) {
+void exactSolver_t::setup(parCSR& _A, bool nullSpace,
+                          memory<dfloat> nullVector, dfloat nullSpacePenalty) {
 
   A = _A;
 
-  comm = A->comm;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &size);
+  comm = A.comm;
+  rank = comm.rank();
+  size = comm.size();
 
   //copy the global coarse partition as ints
-  coarseOffsets = (int* ) calloc(size+1,sizeof(int));
-  for (int r=0;r<size+1;r++) coarseOffsets[r] = (int) A->globalRowStarts[r];
+  coarseOffsets.malloc(size+1);
+  for (int r=0;r<size+1;r++) {
+    coarseOffsets[r] = static_cast<int>(A.globalRowStarts[r]);
+  }
 
   coarseTotal   = coarseOffsets[size];
   coarseOffset  = coarseOffsets[rank];
 
-  N = (int) A->Nrows;
-  Nrows = A->Nrows;
-  Ncols = A->Ncols;
+  N = static_cast<int>(A.Nrows);
+  Nrows = A.Nrows;
+  Ncols = A.Ncols;
 
-  coarseCounts = (int*) calloc(size,sizeof(int));
+  coarseCounts.malloc(size,0);
 
-  int sendNNZ = (int) (A->diag.nnz+A->offd.nnz);
+  int sendNNZ = static_cast<int>(A.diag.nnz+A.offd.nnz);
 
   // if((rank==0)&&(settings.compareSetting("VERBOSE","TRUE")))
   //   {printf("Setting up coarse solver...");fflush(stdout);}
 
-  parCOO::nonZero_t *sendNonZeros = (parCOO::nonZero_t *) calloc(sendNNZ, sizeof(parCOO::nonZero_t));
+  memory<parCOO::nonZero_t> sendNonZeros(sendNNZ);
 
   //populate matrix
   int cnt = 0;
   for (int n=0;n<N;n++) {
-    const int start = (int) A->diag.rowStarts[n];
-    const int end   = (int) A->diag.rowStarts[n+1];
+    const int start = static_cast<int>(A.diag.rowStarts[n]);
+    const int end   = static_cast<int>(A.diag.rowStarts[n+1]);
     for (int m=start;m<end;m++) {
       sendNonZeros[cnt].row = n + coarseOffset;
-      sendNonZeros[cnt].col = A->diag.cols[m] + coarseOffset;
-      sendNonZeros[cnt].val = A->diag.vals[m];
+      sendNonZeros[cnt].col = A.diag.cols[m] + coarseOffset;
+      sendNonZeros[cnt].val = A.diag.vals[m];
       cnt++;
     }
   }
 
-  for (int n=0;n<A->offd.nzRows;n++) {
-    const int row   = (int) A->offd.rows[n];
-    const int start = (int) A->offd.mRowStarts[n];
-    const int end   = (int) A->offd.mRowStarts[n+1];
+  for (int n=0;n<A.offd.nzRows;n++) {
+    const int row   = static_cast<int>(A.offd.rows[n]);
+    const int start = static_cast<int>(A.offd.mRowStarts[n]);
+    const int end   = static_cast<int>(A.offd.mRowStarts[n+1]);
     for (int m=start;m<end;m++) {
       sendNonZeros[cnt].row = row + coarseOffset;
-      sendNonZeros[cnt].col = A->colMap[A->offd.cols[m]];
-      sendNonZeros[cnt].val = A->offd.vals[m];
+      sendNonZeros[cnt].col = A.colMap[A.offd.cols[m]];
+      sendNonZeros[cnt].val = A.offd.vals[m];
       cnt++;
     }
   }
 
   //get the nonzero counts from all ranks
-  int *recvNNZ    = (int*) calloc(size,sizeof(int));
-  int *NNZoffsets = (int*) calloc(size+1,sizeof(int));
-  MPI_Allgather(&sendNNZ, 1, MPI_INT,
-                 recvNNZ, 1, MPI_INT, comm);
+  memory<int> recvNNZ(size);
+  memory<int> NNZoffsets(size+1,0);
+  comm.Allgather(sendNNZ, recvNNZ);
 
   int totalNNZ = 0;
   for (int r=0;r<size;r++) {
@@ -146,29 +144,23 @@ void exactSolver_t::setup(parCSR *_A, bool nullSpace,
     NNZoffsets[r+1] = NNZoffsets[r] + recvNNZ[r];
   }
 
-  parCOO::nonZero_t *recvNonZeros = (parCOO::nonZero_t *) calloc(totalNNZ, sizeof(parCOO::nonZero_t));
+  memory<parCOO::nonZero_t> recvNonZeros(totalNNZ);
 
-  MPI_Allgatherv(sendNonZeros, sendNNZ,             MPI_NONZERO_T,
-                 recvNonZeros, recvNNZ, NNZoffsets, MPI_NONZERO_T, comm);
+  comm.Allgatherv(sendNonZeros, sendNNZ,
+                  recvNonZeros, recvNNZ, NNZoffsets);
 
   //gather null vector
-  dfloat *nullTotal = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
+  memory<dfloat> nullTotal(coarseTotal);
 
-  for (int r=0;r<size;r++)
+  for (int r=0;r<size;r++) {
     coarseCounts[r] = coarseOffsets[r+1]-coarseOffsets[r];
+  }
 
-  MPI_Allgatherv(nullVector,            N,                MPI_DFLOAT,
-                  nullTotal, coarseCounts, coarseOffsets, MPI_DFLOAT,
-                 comm);
-
-  //clean up
-  MPI_Barrier(comm);
-  free(sendNonZeros);
-  free(NNZoffsets);
-  free(recvNNZ);
+  comm.Allgatherv(nullVector, N,
+                  nullTotal, coarseCounts, coarseOffsets);
 
   //assemble the full matrix
-  dfloat *coarseA = (dfloat *) calloc(coarseTotal*coarseTotal,sizeof(dfloat));
+  memory<dfloat> coarseA(coarseTotal*coarseTotal, 0.0);
   for (int i=0;i<totalNNZ;i++) {
     int n = recvNonZeros[i].row;
     int m = recvNonZeros[i].col;
@@ -183,10 +175,7 @@ void exactSolver_t::setup(parCSR *_A, bool nullSpace,
     }
   }
 
-  free(recvNonZeros);
-  free(nullTotal);
-
-  matrixInverse(coarseTotal, coarseA);
+  linAlg_t::matrixInverse(coarseTotal, coarseA);
 
   //determine size of offd piece
   offdTotal = coarseTotal - N;
@@ -199,8 +188,8 @@ void exactSolver_t::setup(parCSR *_A, bool nullSpace,
   coarseCounts[rank]=0;
 
   //counts for all-to-all
-  sendCounts = (int* ) calloc(size,sizeof(int));
-  sendOffsets = (int* ) calloc(size,sizeof(int));
+  sendCounts.malloc(size);
+  sendOffsets.malloc(size);
   for (int r=0;r<size;r++) {
     sendCounts[r] = N;
     sendOffsets[r] = 0;
@@ -208,7 +197,7 @@ void exactSolver_t::setup(parCSR *_A, bool nullSpace,
   sendCounts[rank] = 0;
 
   //diag piece of invA
-  diagInvAT = (dfloat *) calloc(N*N,sizeof(dfloat));
+  diagInvAT.malloc(N*N);
   for (int n=0;n<N;n++) {
     for (int m=0;m<N;m++) {
       diagInvAT[n+m*N] = coarseA[(n+coarseOffset)*coarseTotal+(m+coarseOffset)];
@@ -216,7 +205,7 @@ void exactSolver_t::setup(parCSR *_A, bool nullSpace,
   }
 
   //offd piece of invA
-  offdInvAT = (dfloat *) calloc(N*offdTotal,sizeof(dfloat));
+  offdInvAT.malloc(N*offdTotal);
   for (int n=0;n<N;n++) {
     for (int m=0;m<coarseOffset;m++) {
       offdInvAT[n+m*N] = coarseA[(n+coarseOffset)*coarseTotal+m];
@@ -226,13 +215,13 @@ void exactSolver_t::setup(parCSR *_A, bool nullSpace,
     }
   }
 
-  o_diagInvAT = platform.malloc(N*N*sizeof(dfloat), diagInvAT);
-  o_offdInvAT = platform.malloc(N*offdTotal*sizeof(dfloat), offdInvAT);
+  o_diagInvAT = platform.malloc<dfloat>(diagInvAT);
+  o_offdInvAT = platform.malloc<dfloat>(offdInvAT);
 
-  diagRhs = (dfloat*) calloc(N,sizeof(dfloat));
-  offdRhs = (dfloat*) calloc(offdTotal,sizeof(dfloat));
+  diagRhs.malloc(N);
+  offdRhs.malloc(offdTotal);
 
-  o_offdRhs = platform.malloc(offdTotal*sizeof(dfloat));
+  o_offdRhs = platform.malloc<dfloat>(offdTotal);
 
   // if((rank==0)&&(settings.compareSetting("VERBOSE","TRUE"))) printf("done.\n");
 }
@@ -241,40 +230,36 @@ void exactSolver_t::syncToDevice() {}
 
 void exactSolver_t::Report(int lev) {
 
-  hlong hNrows = (hlong) N;
-
-  int active = (N>0) ? 1:0;
-  int totalActive=0;
-  MPI_Allreduce(&active, &totalActive, 1, MPI_INT, MPI_SUM, comm);
+  int totalActive = (N>0) ? 1:0;
+  comm.Allreduce(totalActive, Comm::Sum);
 
-  dlong minNrows=0, maxNrows=0;
-  hlong totalNrows=0;
-  dfloat avgNrows;
-  MPI_Allreduce(&N, &maxNrows, 1, MPI_DLONG, MPI_MAX, comm);
-  MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, comm);
-  avgNrows = (dfloat) totalNrows/totalActive;
+  dlong minNrows=N, maxNrows=N;
+  hlong totalNrows=N;
+  comm.Allreduce(maxNrows, Comm::Max);
+  comm.Allreduce(totalNrows, Comm::Sum);
+  dfloat avgNrows = (dfloat) totalNrows/totalActive;
 
-  if (N==0) N=maxNrows; //set this so it's ignored for the global min
-  MPI_Allreduce(&N, &minNrows, 1, MPI_DLONG, MPI_MIN, comm);
+  if (N==0) minNrows=maxNrows; //set this so it's ignored for the global min
+  comm.Allreduce(minNrows, Comm::Min);
 
   long long int nnz;
-  nnz = A->diag.nnz+A->offd.nnz;
+  nnz = A.diag.nnz+A.offd.nnz;
 
-  long long int minNnz=0, maxNnz=0, totalNnz=0;
-  MPI_Allreduce(&nnz, &maxNnz,   1, MPI_LONG_LONG_INT, MPI_MAX, A->comm);
-  MPI_Allreduce(&nnz, &totalNnz, 1, MPI_LONG_LONG_INT, MPI_SUM, A->comm);
+  long long int minNnz=nnz, maxNnz=nnz, totalNnz=nnz;
+  comm.Allreduce(maxNnz,   Comm::Max);
+  comm.Allreduce(totalNnz, Comm::Sum);
 
-  if (nnz==0) nnz = maxNnz; //set this so it's ignored for the global min
-  MPI_Allreduce(&nnz, &minNnz, 1, MPI_LONG_LONG_INT, MPI_MIN, A->comm);
+  if (nnz==0) minNnz = maxNnz; //set this so it's ignored for the global min
+  comm.Allreduce(minNnz, Comm::Min);
 
   dfloat nnzPerRow = (Nrows==0) ? 0 : (dfloat) nnz/Nrows;
-  dfloat minNnzPerRow=0, maxNnzPerRow=0, avgNnzPerRow=0;
-  MPI_Allreduce(&nnzPerRow, &maxNnzPerRow, 1, MPI_DFLOAT, MPI_MAX, A->comm);
-  MPI_Allreduce(&nnzPerRow, &avgNnzPerRow, 1, MPI_DFLOAT, MPI_SUM, A->comm);
+  dfloat minNnzPerRow=nnzPerRow, maxNnzPerRow=nnzPerRow, avgNnzPerRow=nnzPerRow;
+  comm.Allreduce(maxNnzPerRow, Comm::Max);
+  comm.Allreduce(avgNnzPerRow, Comm::Sum);
   avgNnzPerRow /= totalActive;
 
-  if (Nrows==0) nnzPerRow = maxNnzPerRow;
-  MPI_Allreduce(&nnzPerRow, &minNnzPerRow, 1, MPI_DFLOAT, MPI_MIN, A->comm);
+  if (Nrows==0) minNnzPerRow = maxNnzPerRow;
+  comm.Allreduce(minNnzPerRow, Comm::Min);
 
   std::string name = "Exact Solve     ";
 
@@ -285,13 +270,6 @@ void exactSolver_t::Report(int lev) {
   }
 }
 
-exactSolver_t::~exactSolver_t() {
-  if (coarseOffsets) free(coarseOffsets);
-  if (coarseCounts) free(coarseCounts);
-  if (diagInvAT) free(diagInvAT);
-  if (offdInvAT) free(offdInvAT);
-  if (diagRhs) free(diagRhs);
-  if (offdRhs) free(offdRhs);
-}
-
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondCoarseOAS.cpp b/libs/parAlmond/parAlmondCoarseOAS.cpp
index 385015bda..031e15658 100644
--- a/libs/parAlmond/parAlmondCoarseOAS.cpp
+++ b/libs/parAlmond/parAlmondCoarseOAS.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,11 +28,13 @@ SOFTWARE.
 #include "parAlmond/parAlmondCoarseSolver.hpp"
 #include "parAlmond/parAlmondKernels.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-void oasSolver_t::solve(occa::memory& o_rhs, occa::memory& o_x) {
+void oasSolver_t::solve(deviceMemory<dfloat>& o_rhs, deviceMemory<dfloat>& o_x) {
 
-  A->halo->ExchangeStart(o_rhs, 1, ogs_dfloat);
+  A.halo.ExchangeStart(o_rhs, 1);
 
   //queue local part of gemv
   const dfloat one=1.0;
@@ -40,34 +42,33 @@ void oasSolver_t::solve(occa::memory& o_rhs, occa::memory& o_x) {
   if (N)
     dGEMVKernel(N,diagTotal,one,o_diagInvAT,o_rhs, zero, o_x);
 
-  A->halo->ExchangeFinish(o_rhs, 1, ogs_dfloat);
+  A.halo.ExchangeFinish(o_rhs, 1);
 
   //queue offd part of gemv
   if(offdTotal && N)
     dGEMVKernel(N,offdTotal, one, o_offdInvAT,
-                o_rhs+diagTotal*sizeof(dfloat), one, o_x);
+                o_rhs+diagTotal, one, o_x);
 
-  A->halo->Combine(o_x, 1, ogs_dfloat);
+  A.halo.Combine(o_x, 1);
 }
 
 
 int oasSolver_t::getTargetSize() {
-  MPI_Comm_size(comm, &size);
-  return 1000*size;
+  return 1000*comm.size(); //1000 times the number of ranks in comm
 }
 
-void oasSolver_t::setup(parCSR *_A, bool nullSpace,
-                        dfloat *nullVector, dfloat nullSpacePenalty) {
+void oasSolver_t::setup(parCSR& _A, bool nullSpace,
+                        memory<dfloat> nullVector, dfloat nullSpacePenalty) {
 
   A = _A;
 
-  comm = A->comm;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &size);
+  comm = A.comm;
+  rank = comm.rank();
+  size = comm.size();
 
-  N = (int) A->Ncols;
-  Nrows = A->Nrows;
-  Ncols = A->Ncols;
+  N = static_cast<int>(A.Ncols);
+  Nrows = A.Nrows;
+  Ncols = A.Ncols;
 
   // if((rank==0)&&(settings.compareSetting("VERBOSE","TRUE")))
   //   {printf("Setting up coarse solver...");fflush(stdout);}
@@ -76,25 +77,24 @@ void oasSolver_t::setup(parCSR *_A, bool nullSpace,
   // corresponding the offd columns
 
   //need to find where to send local rows
-  hlong *recvRows = (hlong *) calloc(A->Ncols-A->Nrows, sizeof(hlong));
+  memory<hlong> recvRows(A.Ncols-A.Nrows);
 
-  int *sendCounts = (int*) calloc(size, sizeof(int));
-  int *recvCounts = (int*) calloc(size, sizeof(int));
-  int *sendOffsets = (int*) calloc(size+1, sizeof(int));
-  int *recvOffsets = (int*) calloc(size+1, sizeof(int));
+  memory<int> sendCounts(size);
+  memory<int> recvCounts(size, 0);
+  memory<int> sendOffsets(size+1, 0);
+  memory<int> recvOffsets(size+1, 0);
 
   //use the colMap to fill the recv sizes
   int r=0;
-  for (int n=A->Nrows;n<A->Ncols;n++) {
-    hlong id = A->colMap[n];
-    while (id>=A->globalRowStarts[r+1]) r++; //assumes the halo is sorted
+  for (int n=A.Nrows;n<A.Ncols;n++) {
+    hlong id = A.colMap[n];
+    while (id>=A.globalRowStarts[r+1]) r++; //assumes the halo is sorted
     recvCounts[r]++;
-    recvRows[n-A->Nrows] = id; //record the row to recv
+    recvRows[n-A.Nrows] = id; //record the row to recv
   }
 
   //share the counts
-  MPI_Alltoall(recvCounts, 1, MPI_INT,
-               sendCounts, 1, MPI_INT, comm);
+  comm.Alltoall(recvCounts, sendCounts);
 
   for (r=0;r<size;r++) {
     sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
@@ -102,48 +102,46 @@ void oasSolver_t::setup(parCSR *_A, bool nullSpace,
   }
 
   int sendTotal = sendOffsets[size];
-  hlong *sendRows = (hlong *) calloc(sendTotal, sizeof(hlong));
+  memory<hlong> sendRows(sendTotal);
 
   //share the rowIds
-  MPI_Alltoallv(recvRows, recvCounts, recvOffsets, MPI_HLONG,
-                sendRows, sendCounts, sendOffsets, MPI_HLONG,
-                comm);
+  comm.Alltoallv(recvRows, recvCounts, recvOffsets,
+                 sendRows, sendCounts, sendOffsets);
 
   //we now have a list of rows to send, count the nnz to send
   dlong nnzTotal=0;
   for (r=0;r<size;r++) {
     sendCounts[r] =0; //reset
     for (int n=sendOffsets[r];n<sendOffsets[r+1];n++) {
-      dlong i = (dlong) (sendRows[n]-A->globalRowStarts[rank]); //local row id
-      sendCounts[r]+= A->diag.rowStarts[i+1]-A->diag.rowStarts[i]; //count entries in this row
-      sendCounts[r]+= A->offd.rowStarts[i+1]-A->offd.rowStarts[i]; //count entries in this row
+      dlong i = static_cast<dlong>(sendRows[n]-A.globalRowStarts[rank]); //local row id
+      sendCounts[r]+= A.diag.rowStarts[i+1]-A.diag.rowStarts[i]; //count entries in this row
+      sendCounts[r]+= A.offd.rowStarts[i+1]-A.offd.rowStarts[i]; //count entries in this row
     }
     nnzTotal += sendCounts[r]; //tally the total
   }
 
-  parCOO::nonZero_t *sendNonZeros = (parCOO::nonZero_t *) calloc(nnzTotal, sizeof(parCOO::nonZero_t));
+  memory<parCOO::nonZero_t> sendNonZeros(nnzTotal);
 
   nnzTotal=0; //reset
   for (r=0;r<size;r++) {
     for (int n=sendOffsets[r];n<sendOffsets[r+1];n++) {
-      dlong i = (dlong) (sendRows[n] - A->globalRowStarts[rank]); //local row id
-      for (dlong jj=A->diag.rowStarts[i]; jj<A->diag.rowStarts[i+1];jj++){
+      dlong i = static_cast<dlong>(sendRows[n] - A.globalRowStarts[rank]); //local row id
+      for (dlong jj=A.diag.rowStarts[i]; jj<A.diag.rowStarts[i+1];jj++){
         sendNonZeros[nnzTotal].row = sendRows[n];
-        sendNonZeros[nnzTotal].col = A->diag.cols[jj] + A->globalRowStarts[rank];
-        sendNonZeros[nnzTotal].val = A->diag.vals[jj];
+        sendNonZeros[nnzTotal].col = A.diag.cols[jj] + A.globalRowStarts[rank];
+        sendNonZeros[nnzTotal].val = A.diag.vals[jj];
         nnzTotal++;
       }
-      for (dlong jj=A->offd.rowStarts[i]; jj<A->offd.rowStarts[i+1];jj++){
+      for (dlong jj=A.offd.rowStarts[i]; jj<A.offd.rowStarts[i+1];jj++){
         sendNonZeros[nnzTotal].row = sendRows[n];
-        sendNonZeros[nnzTotal].col = A->colMap[A->offd.cols[jj]];
-        sendNonZeros[nnzTotal].val = A->offd.vals[jj];
+        sendNonZeros[nnzTotal].col = A.colMap[A.offd.cols[jj]];
+        sendNonZeros[nnzTotal].val = A.offd.vals[jj];
         nnzTotal++;
       }
     }
   }
 
-  MPI_Alltoall(sendCounts, 1, MPI_INT,
-               recvCounts, 1, MPI_INT, comm);
+  comm.Alltoall(sendCounts, recvCounts);
 
   for (r=0;r<size;r++) {
     sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
@@ -152,39 +150,30 @@ void oasSolver_t::setup(parCSR *_A, bool nullSpace,
 
   nnzTotal = recvOffsets[size]; //total nonzeros
 
-  parCOO::nonZero_t *recvNonZeros = (parCOO::nonZero_t *) calloc(nnzTotal, sizeof(parCOO::nonZero_t));
-
-  MPI_Alltoallv(sendNonZeros, sendCounts, sendOffsets, MPI_NONZERO_T,
-                recvNonZeros, recvCounts, recvOffsets, MPI_NONZERO_T,
-                comm);
+  memory<parCOO::nonZero_t> recvNonZeros(nnzTotal);
 
-  //clean up
-  MPI_Barrier(comm);
-  free(sendNonZeros);
-  free(sendCounts);
-  free(recvCounts);
-  free(sendOffsets);
-  free(recvOffsets);
+  comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets,
+                 recvNonZeros, recvCounts, recvOffsets);
 
   //we now have all the nonlocal rows (should also be sorted)
 
   //first re-index the column indices
-  dlong id=A->Nrows;
+  dlong id=A.Nrows;
   for (dlong n=0;n<nnzTotal;n++) {
     const hlong row = recvNonZeros[n].row;
 
-    while(A->colMap[id]!=row) id++; //shift along list of recieved columns
+    while(A.colMap[id]!=row) id++; //shift along list of received columns
 
     recvNonZeros[n].row = id; //overwrite with new local row id
 
     //now check the column index
     hlong col = recvNonZeros[n].col;
-    if (col >= A->globalRowStarts[rank] && col < A->globalRowStarts[rank+1]) {//local column
-      recvNonZeros[n].col = col - A->globalRowStarts[rank];//overwrite with local col id
+    if (col >= A.globalRowStarts[rank] && col < A.globalRowStarts[rank+1]) {//local column
+      recvNonZeros[n].col = col - A.globalRowStarts[rank];//overwrite with local col id
     } else {
       int flag = 0;
-      for (dlong jj=A->Nrows;jj<A->Ncols;jj++) { //look for the right id in the halo
-        if (A->colMap[jj]==col) {
+      for (dlong jj=A.Nrows;jj<A.Ncols;jj++) { //look for the right id in the halo
+        if (A.colMap[jj]==col) {
           recvNonZeros[n].col = jj;//overwrite with local col id
           flag = 1;
           break;
@@ -195,23 +184,23 @@ void oasSolver_t::setup(parCSR *_A, bool nullSpace,
   }
 
   //assemble the full matrix
-  dfloat *coarseA = (dfloat *) calloc(N*N,sizeof(dfloat));
-  for (int n=0;n<A->Nrows;n++) {
-    const int start = (int) A->diag.rowStarts[n];
-    const int end   = (int) A->diag.rowStarts[n+1];
+  memory<dfloat> coarseA(N*N, 0.0); //zero-fill like the calloc it replaces: only the nonzeros are written below
+  for (int n=0;n<A.Nrows;n++) {
+    const int start = static_cast<int>(A.diag.rowStarts[n]);
+    const int end   = static_cast<int>(A.diag.rowStarts[n+1]);
     for (int m=start;m<end;m++) {
-      int col = (int) A->diag.cols[m];
-      coarseA[n*N+col] = A->diag.vals[m];
+      int col = static_cast<int>(A.diag.cols[m]);
+      coarseA[n*N+col] = A.diag.vals[m];
     }
   }
 
-  for (int n=0;n<A->offd.nzRows;n++) {
-    const int row   = (int) A->offd.rows[n];
-    const int start = (int) A->offd.mRowStarts[n];
-    const int end   = (int) A->offd.mRowStarts[n+1];
+  for (int n=0;n<A.offd.nzRows;n++) {
+    const int row   = static_cast<int>(A.offd.rows[n]);
+    const int start = static_cast<int>(A.offd.mRowStarts[n]);
+    const int end   = static_cast<int>(A.offd.mRowStarts[n+1]);
     for (int m=start;m<end;m++) {
-      int col = (int) A->offd.cols[m];
-      coarseA[row*N+col] = A->offd.vals[m];
+      int col = static_cast<int>(A.offd.cols[m]);
+      coarseA[row*N+col] = A.offd.vals[m];
     }
   }
 
@@ -224,43 +213,36 @@ void oasSolver_t::setup(parCSR *_A, bool nullSpace,
 
   if (nullSpace) { //A is dense due to nullspace augmentation
     //copy fine nullvector and populate halo
-    dfloat *null = (dfloat *) malloc(A->Ncols*sizeof(dfloat));
-    memcpy(null, nullVector, A->Nrows*sizeof(dfloat));
-    A->halo->Exchange(null, 1, ogs_dfloat);
+    memory<dfloat> null(A.Ncols);
+    null.copyFrom(nullVector, A.Nrows);
+    A.halo.Exchange(null, 1);
 
     for (int n=0;n<N;n++) {
       for (int m=0;m<N;m++) {
         coarseA[n*N+m] += nullSpacePenalty*null[n]*null[m];
       }
     }
-
-    free(null);
   }
 
-  MPI_Barrier(comm);
-  free(recvNonZeros);
-
-  matrixInverse(N, coarseA);
+  linAlg_t::matrixInverse(N, coarseA);
 
   //determine the overlap weighting
-  dfloat *weight = (dfloat *) malloc(N*sizeof(dfloat));
-  for (int n=0;n<N;n++) weight[n] = 1.0;
+  memory<dfloat> weight(N, 1.0);
 
-  A->halo->Combine(weight, 1, ogs_dfloat);
+  A.halo.Combine(weight, 1);
 
   for (int n=0;n<N;n++) {
     for (int m=0;m<N;m++) {
       coarseA[n*N+m] *= 1.0/sqrt(weight[n]*weight[m]);
     }
   }
-  free(weight);
 
   //determine size of offd piece
-  diagTotal = A->Nrows;
-  offdTotal = A->Ncols - A->Nrows;
+  diagTotal = A.Nrows;
+  offdTotal = A.Ncols - A.Nrows;
 
   //diag piece of invA
-  diagInvAT = (dfloat *) calloc(N*diagTotal,sizeof(dfloat));
+  diagInvAT.malloc(N*diagTotal);
   for (int n=0;n<N;n++) {
     for (int m=0;m<diagTotal;m++) {
       diagInvAT[n+m*N] = coarseA[n*N+m];
@@ -268,15 +250,15 @@ void oasSolver_t::setup(parCSR *_A, bool nullSpace,
   }
 
   //offd piece of invA
-  offdInvAT = (dfloat *) calloc(N*offdTotal,sizeof(dfloat));
+  offdInvAT.malloc(N*offdTotal);
   for (int n=0;n<N;n++) {
     for (int m=0;m<offdTotal;m++) {
       offdInvAT[n+m*N] = coarseA[n*N + m+diagTotal];
     }
   }
 
-  o_diagInvAT = platform.malloc(N*diagTotal*sizeof(dfloat), diagInvAT);
-  o_offdInvAT = platform.malloc(N*offdTotal*sizeof(dfloat), offdInvAT);
+  o_diagInvAT = platform.malloc<dfloat>(diagInvAT);
+  o_offdInvAT = platform.malloc<dfloat>(offdInvAT);
 
   // if((rank==0)&&(settings.compareSetting("VERBOSE","TRUE"))) printf("done.\n");
 }
@@ -285,40 +267,36 @@ void oasSolver_t::syncToDevice() {}
 
 void oasSolver_t::Report(int lev) {
 
-  hlong hNrows = (hlong) N;
+  int totalActive = (N>0) ? 1:0;
+  comm.Allreduce(totalActive, Comm::Sum);
 
-  int active = (N>0) ? 1:0;
-  int totalActive=0;
-  MPI_Allreduce(&active, &totalActive, 1, MPI_INT, MPI_SUM, comm);
+  dlong minNrows=N, maxNrows=N;
+  hlong totalNrows=N;
+  comm.Allreduce(maxNrows, Comm::Max);
+  comm.Allreduce(totalNrows, Comm::Sum);
+  dfloat avgNrows = static_cast<dfloat>(totalNrows)/totalActive;
 
-  dlong minNrows=0, maxNrows=0;
-  hlong totalNrows=0;
-  dfloat avgNrows;
-  MPI_Allreduce(&N, &maxNrows, 1, MPI_DLONG, MPI_MAX, comm);
-  MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, comm);
-  avgNrows = (dfloat) totalNrows/totalActive;
-
-  if (N==0) N=maxNrows; //set this so it's ignored for the global min
-  MPI_Allreduce(&N, &minNrows, 1, MPI_DLONG, MPI_MIN, comm);
+  if (N==0) minNrows=maxNrows; //set this so it's ignored for the global min
+  comm.Allreduce(minNrows, Comm::Min);
 
   long long int nnz;
-  nnz = A->diag.nnz+A->offd.nnz;
+  nnz = A.diag.nnz+A.offd.nnz;
 
-  long long int minNnz=0, maxNnz=0, totalNnz=0;
-  MPI_Allreduce(&nnz, &maxNnz,   1, MPI_LONG_LONG_INT, MPI_MAX, comm);
-  MPI_Allreduce(&nnz, &totalNnz, 1, MPI_LONG_LONG_INT, MPI_SUM, comm);
+  long long int minNnz=nnz, maxNnz=nnz, totalNnz=nnz;
+  comm.Allreduce(maxNnz, Comm::Max);
+  comm.Allreduce(totalNnz, Comm::Sum);
 
-  if (nnz==0) nnz = maxNnz; //set this so it's ignored for the global min
-  MPI_Allreduce(&nnz, &minNnz, 1, MPI_LONG_LONG_INT, MPI_MIN, comm);
+  if (nnz==0) minNnz = maxNnz; //set this so it's ignored for the global min
+  comm.Allreduce(minNnz, Comm::Min);
 
-  dfloat nnzPerRow = (Nrows==0) ? 0 : (dfloat) nnz/Nrows;
-  dfloat minNnzPerRow=0, maxNnzPerRow=0, avgNnzPerRow=0;
-  MPI_Allreduce(&nnzPerRow, &maxNnzPerRow, 1, MPI_DFLOAT, MPI_MAX, comm);
-  MPI_Allreduce(&nnzPerRow, &avgNnzPerRow, 1, MPI_DFLOAT, MPI_SUM, comm);
+  dfloat nnzPerRow = (Nrows==0) ? 0 : static_cast<dfloat>(nnz)/Nrows;
+  dfloat minNnzPerRow=nnzPerRow, maxNnzPerRow=nnzPerRow, avgNnzPerRow=nnzPerRow;
+  comm.Allreduce(maxNnzPerRow, Comm::Max);
+  comm.Allreduce(avgNnzPerRow, Comm::Sum);
   avgNnzPerRow /= totalActive;
 
-  if (Nrows==0) nnzPerRow = maxNnzPerRow;
-  MPI_Allreduce(&nnzPerRow, &minNnzPerRow, 1, MPI_DFLOAT, MPI_MIN, comm);
+  if (Nrows==0) minNnzPerRow = maxNnzPerRow;
+  comm.Allreduce(minNnzPerRow, Comm::Min);
 
   std::string name = "OAS             ";
 
@@ -329,9 +307,6 @@ void oasSolver_t::Report(int lev) {
   }
 }
 
-oasSolver_t::~oasSolver_t() {
-  if (diagInvAT) free(diagInvAT);
-  if (offdInvAT) free(offdInvAT);
-}
-
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondCoarsenLevel.cpp b/libs/parAlmond/parAlmondCoarsenLevel.cpp
index 3ed7c06f2..97120f2a7 100644
--- a/libs/parAlmond/parAlmondCoarsenLevel.cpp
+++ b/libs/parAlmond/parAlmondCoarsenLevel.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,62 +27,61 @@ SOFTWARE.
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondAMGSetup.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
 //create coarsened problem
-amgLevel *coarsenAmgLevel(amgLevel *level, dfloat *null,
-                          StrengthType strtype, dfloat theta,
-                          AggType aggtype){
+amgLevel coarsenAmgLevel(amgLevel& level, memory<dfloat>& null,
+                         StrengthType strtype, dfloat theta,
+                         AggType aggtype){ //returns the new coarse level; also sets level.P, level.R and level.Ncols
+
+  parCSR& A = level.A; //operator of the level being coarsened
 
-  int size;
-  MPI_Comm_size(level->A->comm, &size);
+  int size = A.comm.size(); //number of ranks in A's communicator
 
-  strongGraph_t *C = strongGraph(level->A, strtype, theta);
+  strongGraph_t C = strongGraph(A, strtype, theta);
 
-  hlong *FineToCoarse = (hlong *) malloc(level->A->Ncols*sizeof(hlong));
-  hlong *globalAggStarts = (hlong *) calloc(size+1,sizeof(hlong));
+  memory<hlong> FineToCoarse(A.Ncols); //sized for all local columns of A; populated in formAggregates
+  memory<hlong> globalAggStarts(size+1); //size+1 entries; formAggregates writes every one
 
-  formAggregates(level->A, C, FineToCoarse, globalAggStarts);
-  delete C;
+  formAggregates(A, C, FineToCoarse, globalAggStarts);
 
   // adjustPartition(FineToCoarse, settings);
 
-  parCSR *P;
-  parCSR *T = tentativeProlongator(level->A, FineToCoarse, globalAggStarts, null);
+  parCSR P;
+  parCSR T = tentativeProlongator(A, FineToCoarse, globalAggStarts, null);
   if (aggtype == SMOOTHED) {
-    P = smoothProlongator(level->A, T);
-    delete T;
+    P = smoothProlongator(A, T);
   } else {
     P = T;
   }
 
   // R = P^T
-  parCSR *R = transpose(P);
+  parCSR R = transpose(P);
 
-  level->P = P;
-  level->R = R;
+  level.P = P;
+  level.R = R;
 
-  parCSR *Acoarse;
+  parCSR Acoarse;
   if (aggtype == SMOOTHED) {
-    parCSR *AP = SpMM(level->A, P);
+    parCSR AP = SpMM(A, P); //intermediate product for Acoarse = SpMM(R, SpMM(A, P))
     Acoarse = SpMM(R, AP);
-    delete AP;
   } else {
-    Acoarse = galerkinProd(level->A, P); //specialize for unsmoothed aggregation
+    Acoarse = galerkinProd(A, P); //specialize for unsmoothed aggregation
   }
 
-  Acoarse->diagSetup();
+  Acoarse.diagSetup();
 
-  amgLevel *coarseLevel = new amgLevel(Acoarse,level->settings);
+  amgLevel coarseLevel(Acoarse,level.settings); //built on the stack and returned by value
 
   //update the number of columns required for this level
-  level->Ncols = (level->Ncols > R->Ncols) ? level->Ncols : R->Ncols;
+  level.Ncols = std::max(level.Ncols, std::max(A.Ncols, R.Ncols)); //now also accounts for A.Ncols, not only R.Ncols
   // coarseLevel->Ncols = (coarseLevel->Ncols > P->Ncols) ? coarseLevel->Ncols : P->Ncols;
 
-  free(FineToCoarse);
-  free(globalAggStarts);
-
   return coarseLevel;
 }
 
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondFormAggregates.cpp b/libs/parAlmond/parAlmondFormAggregates.cpp
index b42ec5d95..0fea3d499 100644
--- a/libs/parAlmond/parAlmondFormAggregates.cpp
+++ b/libs/parAlmond/parAlmondFormAggregates.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,9 +27,12 @@ SOFTWARE.
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondAMGSetup.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-static bool customLess(int smax, dfloat rmax, hlong imax, int s, dfloat r, hlong i){
+static bool customLess(const int smax, const dfloat rmax, const hlong imax,
+                       const int s,    const dfloat r,    const hlong i){
 
   if(s > smax) return true;
   if(smax > s) return false;
@@ -49,46 +52,43 @@ static bool customLess(int smax, dfloat rmax, hlong imax, int s, dfloat r, hlong
 //
 /*****************************************************************************/
 
-void formAggregates(parCSR *A, strongGraph_t *C,
-                    hlong* FineToCoarse,
-                    hlong* globalAggStarts){
+void formAggregates(parCSR& A, strongGraph_t& C,
+                    memory<hlong> FineToCoarse,
+                    memory<hlong> globalAggStarts){
 
-  int rank, size;
-  MPI_Comm_rank(A->comm, &rank);
-  MPI_Comm_size(A->comm, &size);
+  int rank = A.comm.rank();
+  int size = A.comm.size();
 
-  const dlong N   = C->Nrows;
-  const dlong M   = C->Ncols;
-  const dlong nnz = C->nnz;
+  const dlong N   = C.Nrows;
+  const dlong M   = C.Ncols;
+  const dlong nnz = C.nnz;
 
-  dfloat *rands = (dfloat *) calloc(M, sizeof(dfloat));
-  int   *states = (int *)    calloc(M, sizeof(int));
-  hlong *colMap = A->colMap; //mapping from local column ids to global ids
+  memory<dfloat> rands(M);
+  memory<int>   states(M, 0);
+  memory<hlong> colMap = A.colMap; //mapping from local column ids to global ids
 
-  dfloat *Tr = (dfloat *) calloc(M, sizeof(dfloat));
-  int    *Ts = (int *)    calloc(M, sizeof(int));
-  hlong  *Ti = (hlong *)  calloc(M, sizeof(hlong));
-  hlong  *Tc = (hlong *)  calloc(M, sizeof(hlong));
+  memory<dfloat> Tr(M);
+  memory<int>    Ts(M);
+  memory<hlong>  Ti(M);
+  memory<hlong>  Tc(M);
 
   for(dlong i=0; i<N; i++)
     rands[i] = (dfloat) drand48();
 
   // add the number of non-zeros in each column
-  int *colCnt = (int *) calloc(M,sizeof(int));
+  memory<int> colCnt(M, 0);
   for(dlong i=0; i<nnz; i++)
-    colCnt[C->cols[i]]++;
+    colCnt[C.cols[i]]++;
 
   //gs for total column counts
-  A->halo->Combine(colCnt, 1, ogs_int);
+  A.halo.Combine(colCnt, 1);
 
   //add random pertubation
   for(int i=0;i<N;++i)
     rands[i] += colCnt[i];
 
-  free(colCnt);
-
   //gs to fill halo region
-  A->halo->Exchange(rands, 1, ogs_dfloat);
+  A.halo.Exchange(rands, 1);
 
   hlong done = 0;
   while(!done){
@@ -100,8 +100,8 @@ void formAggregates(parCSR *A, strongGraph_t *C,
       hlong  imax = colMap[i];
 
       if(smax != 1){
-        for(dlong jj=C->rowStarts[i];jj<C->rowStarts[i+1];jj++){
-          const dlong col = C->cols[jj];
+        for(dlong jj=C.rowStarts[i];jj<C.rowStarts[i+1];jj++){
+          const dlong col = C.cols[jj];
           if (col==i) continue;
           if(customLess(smax, rmax, imax, states[col], rands[col], colMap[col])){
             smax = states[col];
@@ -116,9 +116,9 @@ void formAggregates(parCSR *A, strongGraph_t *C,
     }
 
     //share results
-    A->halo->Exchange(Tr, 1, ogs_dfloat);
-    A->halo->Exchange(Ts, 1, ogs_int);
-    A->halo->Exchange(Ti, 1, ogs_hlong);
+    A.halo.Exchange(Tr, 1);
+    A.halo.Exchange(Ts, 1);
+    A.halo.Exchange(Ti, 1);
 
     // second neighbours
     // #pragma omp parallel for
@@ -127,8 +127,8 @@ void formAggregates(parCSR *A, strongGraph_t *C,
       dfloat rmax = Tr[i];
       hlong  imax = Ti[i];
 
-      for(dlong jj=C->rowStarts[i];jj<C->rowStarts[i+1];jj++){
-        const dlong col = C->cols[jj];
+      for(dlong jj=C.rowStarts[i];jj<C.rowStarts[i+1];jj++){
+        const dlong col = C.cols[jj];
         if (col==i) continue;
         if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){
           smax = Ts[col];
@@ -148,31 +148,28 @@ void formAggregates(parCSR *A, strongGraph_t *C,
     }
 
     //share results
-    A->halo->Exchange(states, 1, ogs_int);
+    A.halo.Exchange(states, 1);
 
     // if number of undecided nodes = 0, algorithm terminates
-    hlong cnt = 0;
-    for (dlong n=0;n<N;n++) if (states[n]==0) cnt++;
+    for (dlong n=0;n<N;n++) if (states[n]==0) done++;
 
-    MPI_Allreduce(&cnt,&done,1,MPI_HLONG, MPI_SUM,A->comm);
+    A.comm.Allreduce(done, Comm::Sum);
     done = (done == 0) ? 1 : 0;
   }
 
   dlong numAggs = 0;
-  dlong *gNumAggs = (dlong *) calloc(size,sizeof(dlong));
+  memory<dlong> gNumAggs(size);
 
   // count the coarse nodes/aggregates
   for(dlong i=0; i<N; i++)
     if(states[i] == 1) numAggs++;
 
-  MPI_Allgather(&numAggs,1,MPI_DLONG,gNumAggs,1,MPI_DLONG,A->comm);
+  A.comm.Allgather(numAggs, gNumAggs);
 
   globalAggStarts[0] = 0;
   for (int r=0;r<size;r++)
     globalAggStarts[r+1] = globalAggStarts[r] + gNumAggs[r];
 
-  free(gNumAggs);
-
   numAggs = 0;
   // enumerate the coarse nodes/aggregates
   for(dlong i=0; i<N; i++) {
@@ -184,7 +181,7 @@ void formAggregates(parCSR *A, strongGraph_t *C,
   }
 
   //share the initial aggregate flags
-  A->halo->Exchange(FineToCoarse, 1, ogs_hlong);
+  A.halo.Exchange(FineToCoarse, 1);
 
   // form the aggregates
   // #pragma omp parallel for
@@ -195,8 +192,8 @@ void formAggregates(parCSR *A, strongGraph_t *C,
     hlong  cmax = FineToCoarse[i];
 
     if(smax != 1){
-      for(dlong jj=C->rowStarts[i];jj<C->rowStarts[i+1];jj++){
-        const dlong col = C->cols[jj];
+      for(dlong jj=C.rowStarts[i];jj<C.rowStarts[i+1];jj++){
+        const dlong col = C.cols[jj];
         if (col==i) continue;
         if(customLess(smax, rmax, imax, states[col], rands[col], colMap[col])){
           smax = states[col];
@@ -216,11 +213,11 @@ void formAggregates(parCSR *A, strongGraph_t *C,
   }
 
   //share results
-  A->halo->Exchange(FineToCoarse, 1, ogs_hlong);
-  A->halo->Exchange(Tr,     1, ogs_dfloat);
-  A->halo->Exchange(Ts,     1, ogs_int);
-  A->halo->Exchange(Ti,     1, ogs_hlong);
-  A->halo->Exchange(Tc,     1, ogs_hlong);
+  A.halo.Exchange(FineToCoarse, 1);
+  A.halo.Exchange(Tr, 1);
+  A.halo.Exchange(Ts, 1);
+  A.halo.Exchange(Ti, 1);
+  A.halo.Exchange(Tc, 1);
 
   // second neighbours
   // #pragma omp parallel for
@@ -230,8 +227,8 @@ void formAggregates(parCSR *A, strongGraph_t *C,
     hlong  imax = Ti[i];
     hlong  cmax = Tc[i];
 
-    for(dlong jj=C->rowStarts[i];jj<C->rowStarts[i+1];jj++){
-      const dlong col = C->cols[jj];
+    for(dlong jj=C.rowStarts[i];jj<C.rowStarts[i+1];jj++){
+      const dlong col = C.cols[jj];
       if (col==i) continue;
       if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){
         smax = Ts[col];
@@ -246,14 +243,9 @@ void formAggregates(parCSR *A, strongGraph_t *C,
   }
 
   //share results
-  A->halo->Exchange(FineToCoarse, 1, ogs_hlong);
-
-  free(rands);
-  free(states);
-  free(Tr);
-  free(Ts);
-  free(Ti);
-  free(Tc);
+  A.halo.Exchange(FineToCoarse, 1);
 }
 
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondGalerkinProd.cpp b/libs/parAlmond/parAlmondGalerkinProd.cpp
index 03429d38b..2d21207ed 100644
--- a/libs/parAlmond/parAlmondGalerkinProd.cpp
+++ b/libs/parAlmond/parAlmondGalerkinProd.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,16 +27,17 @@ SOFTWARE.
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondAMGSetup.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-parCSR *galerkinProd(parCSR *A, parCSR *P){
+parCSR galerkinProd(parCSR& A, parCSR& P){
 
   // MPI info
-  int rank, size;
-  MPI_Comm_rank(A->comm, &rank);
-  MPI_Comm_size(A->comm, &size);
+  int rank = A.comm.rank();
+  int size = A.comm.size();
 
-  hlong *globalAggStarts = P->globalColStarts;
+  memory<hlong> globalAggStarts = P.globalColStarts;
   hlong globalAggOffset = globalAggStarts[rank];
 
   //The galerkin product can be computed as
@@ -44,47 +45,47 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){
   // Since each row of P has only one entry, we can share the necessary
   // P entries, form the products, and send them to their destination rank
 
-  const dlong N = A->Nrows;
-  const dlong M = A->Ncols;
+  const dlong N = A.Nrows;
+  const dlong M = A.Ncols;
 
   //printf("Level has %d rows, and is making %d aggregates\n", N, globalAggStarts[rank+1]-globalAggStarts[rank]);
 
   // Exploit the fact that we know P has one non-zero per row to
   // compress the global Ids of the columns and nonzero values to
   // single vectors
-  hlong  *Pcols = (hlong  *) calloc(M,sizeof(hlong));
-  pfloat *Pvals = (pfloat *) calloc(M,sizeof(pfloat));
+  memory<hlong>  Pcols(M);
+  memory<pfloat> Pvals(M);
 
   //record the entries of P that this rank has
   for (dlong i=0;i<N;i++) {
-    for (dlong j=P->diag.rowStarts[i];j<P->diag.rowStarts[i+1];j++) {
-      Pcols[i] = P->diag.cols[j] + globalAggOffset; //global ID
-      Pvals[i] = P->diag.vals[j];
+    for (dlong j=P.diag.rowStarts[i];j<P.diag.rowStarts[i+1];j++) {
+      Pcols[i] = P.diag.cols[j] + globalAggOffset; //global ID
+      Pvals[i] = P.diag.vals[j];
     }
   }
-  for (dlong i=0;i<P->offd.nzRows;i++) {
-    const dlong row = P->offd.rows[i];
-    for (dlong j=P->offd.mRowStarts[i];j<P->offd.mRowStarts[i+1];j++) {
-      Pcols[row] = P->colMap[P->offd.cols[j]]; //global ID
-      Pvals[row] = P->offd.vals[j];
+  for (dlong i=0;i<P.offd.nzRows;i++) {
+    const dlong row = P.offd.rows[i];
+    for (dlong j=P.offd.mRowStarts[i];j<P.offd.mRowStarts[i+1];j++) {
+      Pcols[row] = P.colMap[P.offd.cols[j]]; //global ID
+      Pvals[row] = P.offd.vals[j];
     }
   }
 
   //fill the halo region
-  A->halo->Exchange(Pcols, 1, ogs_hlong);
-  A->halo->Exchange(Pvals, 1, ogs_pfloat);
+  A.halo.Exchange(Pcols, 1);
+  A.halo.Exchange(Pvals, 1);
 
-  dlong sendNtotal = A->diag.nnz+A->offd.nnz;
-  parCOO::nonZero_t *sendPTAP = (parCOO::nonZero_t *) calloc(sendNtotal,sizeof(parCOO::nonZero_t));
+  dlong sendNtotal = A.diag.nnz+A.offd.nnz;
+  memory<parCOO::nonZero_t> sendPTAP(sendNtotal);
 
   //form the fine PTAP products
   dlong cnt =0;
   for (dlong i=0;i<N;i++) {
-    const dlong start = A->diag.rowStarts[i];
-    const dlong end   = A->diag.rowStarts[i+1];
+    const dlong start = A.diag.rowStarts[i];
+    const dlong end   = A.diag.rowStarts[i+1];
     for (dlong j=start;j<end;j++) {
-      const dlong  col = A->diag.cols[j];
-      const dfloat val = A->diag.vals[j];
+      const dlong  col = A.diag.cols[j];
+      const dfloat val = A.diag.vals[j];
 
       sendPTAP[cnt].row = Pcols[i];
       sendPTAP[cnt].col = Pcols[col];
@@ -92,13 +93,13 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){
       cnt++;
     }
   }
-  for (dlong i=0;i<A->offd.nzRows;i++) {
-    const dlong row   = A->offd.rows[i];
-    const dlong start = A->offd.mRowStarts[i];
-    const dlong end   = A->offd.mRowStarts[i+1];
+  for (dlong i=0;i<A.offd.nzRows;i++) {
+    const dlong row   = A.offd.rows[i];
+    const dlong start = A.offd.mRowStarts[i];
+    const dlong end   = A.offd.mRowStarts[i+1];
     for (dlong j=start;j<end;j++) {
-      const dlong  col = A->offd.cols[j];
-      const dfloat val = A->offd.vals[j];
+      const dlong  col = A.offd.cols[j];
+      const dfloat val = A.offd.vals[j];
 
       sendPTAP[cnt].row = Pcols[row];
       sendPTAP[cnt].col = Pcols[col];
@@ -107,11 +108,8 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){
     }
   }
 
-  free(Pcols);
-  free(Pvals);
-
   //sort entries by the coarse row and col
-  std::sort(sendPTAP, sendPTAP+sendNtotal,
+  std::sort(sendPTAP.ptr(), sendPTAP.ptr()+sendNtotal,
             [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& b) {
               if (a.row < b.row) return true;
               if (a.row > b.row) return false;
@@ -120,10 +118,10 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){
             });
 
   //count number of non-zeros we're sending
-  int *sendCounts = (int *) calloc(size,sizeof(int));
-  int *recvCounts = (int *) calloc(size,sizeof(int));
-  int *sendOffsets = (int *) calloc(size+1,sizeof(int));
-  int *recvOffsets = (int *) calloc(size+1,sizeof(int));
+  memory<int> sendCounts(size,0);
+  memory<int> recvCounts(size);
+  memory<int> sendOffsets(size+1);
+  memory<int> recvOffsets(size+1);
 
   int r=0;
   for(dlong i=0;i<sendNtotal;++i) {
@@ -133,30 +131,24 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){
   }
 
   // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(sendCounts, 1, MPI_INT,
-               recvCounts, 1, MPI_INT, A->comm);
+  A.comm.Alltoall(sendCounts, recvCounts);
 
   // find send and recv offsets for gather
+  sendOffsets[0] = 0;
+  recvOffsets[0] = 0;
   for(int rr=0;rr<size;++rr){
     sendOffsets[rr+1] = sendOffsets[rr] + sendCounts[rr];
     recvOffsets[rr+1] = recvOffsets[rr] + recvCounts[rr];
   }
   dlong recvNtotal = recvOffsets[size];
 
-  parCOO::nonZero_t *recvPTAP = (parCOO::nonZero_t *) calloc(recvNtotal,sizeof(parCOO::nonZero_t));
-
-  MPI_Alltoallv(sendPTAP, sendCounts, sendOffsets, MPI_NONZERO_T,
-                recvPTAP, recvCounts, recvOffsets, MPI_NONZERO_T,
-                A->comm);
+  memory<parCOO::nonZero_t> recvPTAP(recvNtotal);
 
-  //clean up
-  MPI_Barrier(A->comm);
-  free(sendPTAP);
-  free(sendCounts); free(recvCounts);
-  free(sendOffsets); free(recvOffsets);
+  A.comm.Alltoallv(sendPTAP, sendCounts, sendOffsets,
+                   recvPTAP, recvCounts, recvOffsets);
 
   //sort entries by the coarse row and col
-  std::sort(recvPTAP, recvPTAP+recvNtotal,
+  std::sort(recvPTAP.ptr(), recvPTAP.ptr()+recvNtotal,
             [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& b) {
               if (a.row < b.row) return true;
               if (a.row > b.row) return false;
@@ -172,16 +164,14 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){
         (recvPTAP[i].col!=recvPTAP[i-1].col)) nnz++;
 
 
-  parCOO PTAP(A->platform, A->comm);
+  parCOO PTAP(A.platform, A.comm);
 
   //copy global partition
-  PTAP.globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  PTAP.globalColStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  memcpy(PTAP.globalRowStarts, globalAggStarts, (size+1)*sizeof(hlong));
-  memcpy(PTAP.globalColStarts, globalAggStarts, (size+1)*sizeof(hlong));
+  PTAP.globalRowStarts = globalAggStarts;
+  PTAP.globalColStarts = globalAggStarts;
 
   PTAP.nnz = nnz;
-  PTAP.entries = (parCOO::nonZero_t *) malloc(PTAP.nnz*sizeof(parCOO::nonZero_t));
+  PTAP.entries.malloc(PTAP.nnz);
 
   //compress nonzeros
   nnz = 0;
@@ -195,12 +185,10 @@ parCSR *galerkinProd(parCSR *A, parCSR *P){
     }
   }
 
-  //clean up
-  MPI_Barrier(A->comm);
-  free(recvPTAP);
-
   //build Ac from coo matrix
-  return new parCSR(PTAP);
+  return parCSR(PTAP);
 }
 
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondKcycle.cpp b/libs/parAlmond/parAlmondKcycle.cpp
index 4b5bf5d6f..4f5eb4d20 100644
--- a/libs/parAlmond/parAlmondKcycle.cpp
+++ b/libs/parAlmond/parAlmondKcycle.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,11 +26,13 @@ SOFTWARE.
 
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondKernels.hpp"
-#include "parAlmond/parAlmondMultigrid.hpp"
+#include "parAlmond/parAlmondCoarseSolver.hpp"
+
+namespace libp {
 
 namespace parAlmond {
 
-void multigrid_t::kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){
+void multigrid_t::kcycle(const int k, deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_X){
 
   //check for base level
   if(k==baseLevel) {
@@ -38,21 +40,21 @@ void multigrid_t::kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){
     return;
   }
 
-  multigridLevel *level  = levels[k];
-  multigridLevel *levelC = levels[k+1];
-  occa::memory& o_RHSC = o_rhs[k+1];
-  occa::memory& o_XC   = o_x[k+1];
-  occa::memory& o_RES   = o_scratch;
+  multigridLevel& level  = *levels[k];
+  multigridLevel& levelC = *levels[k+1];
+  deviceMemory<dfloat>& o_RHSC = o_rhs[k+1];
+  deviceMemory<dfloat>& o_XC   = o_x[k+1];
+  deviceMemory<dfloat>& o_RES  = o_scratch;
 
-  const dlong mCoarse = levelC->Nrows;
+  const dlong mCoarse = levelC.Nrows;
 
   //apply smoother to x and then compute res = rhs-Ax
-  level->smooth(o_RHS, o_X, true);
+  level.smooth(o_RHS, o_X, true);
 
-  level->residual(o_RHS, o_X, o_RES);
+  level.residual(o_RHS, o_X, o_RES);
 
   // rhsC = P^T res
-  level->coarsen(o_RES, o_RHSC);
+  level.coarsen(o_RES, o_RHSC);
 
   if(k+1>NUMKCYCLES) {
     vcycle(k+1, o_RHSC, o_XC);
@@ -60,9 +62,9 @@ void multigrid_t::kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){
     // first inner krylov iteration
     kcycle(k+1, o_RHSC, o_XC);
 
-    occa::memory& o_CK   = o_ck[k+1];
-    occa::memory& o_VK   = o_vk[k+1];
-    occa::memory& o_WK   = o_wk[k+1];
+    deviceMemory<dfloat>& o_CK   = o_ck[k+1];
+    deviceMemory<dfloat>& o_VK   = o_vk[k+1];
+    deviceMemory<dfloat>& o_WK   = o_wk[k+1];
 
     // ck = xC, vk = A*ck
     // alpha1=ck*rhsC, rho1=ck*Ack, norm_rhs=sqrt(rhsC*rhsC)
@@ -70,11 +72,11 @@ void multigrid_t::kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){
     // norm_rtilde = sqrt(rhsC*rhsC)
     dfloat rho1, alpha1, norm_rhs, norm_rhstilde;
     kcycleOp1(levelC, o_XC, o_RHSC, o_CK, o_VK,
-              &alpha1, &rho1, &norm_rhs, &norm_rhstilde);
+              alpha1, rho1, norm_rhs, norm_rhstilde);
 
     if(norm_rhstilde < KCYCLETOL*norm_rhs){
       // xC = (alpha1/rho1)*xC
-      platform.linAlg.scale(mCoarse, alpha1/rho1, o_XC);
+      platform.linAlg().scale(mCoarse, alpha1/rho1, o_XC);
     } else{
 
       // second inner krylov iteration
@@ -89,135 +91,159 @@ void multigrid_t::kcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){
   }
 
   // x = x + P xC
-  level->prolongate(o_XC, o_X);
+  level.prolongate(o_XC, o_X);
 
-  level->smooth(o_RHS, o_X, false);
+  level.smooth(o_RHS, o_X, false);
 }
 
 
-void multigrid_t::kcycleOp1(multigridLevel* level,
-                           occa::memory& o_X,  occa::memory& o_RHS,
-                           occa::memory& o_CK, occa::memory& o_VK,
-                           dfloat *alpha1, dfloat *rho1,
-                           dfloat *norm_rhs, dfloat *norm_rhstilde) {
+void multigrid_t::kcycleOp1(multigridLevel& level,
+                           deviceMemory<dfloat>& o_X,  deviceMemory<dfloat>& o_RHS,
+                           deviceMemory<dfloat>& o_CK, deviceMemory<dfloat>& o_VK,
+                           dfloat& alpha1, dfloat& rho1,
+                           dfloat& norm_rhs, dfloat& norm_rhstilde) {
 
   //ck = x
-  platform.linAlg.axpy(level->Nrows, 1.0, o_X, 0.0, o_CK);
+  platform.linAlg().axpy(level.Nrows, 1.0, o_X, 0.0, o_CK);
 
   // vk = A*ck
-  level->Operator(o_CK,o_VK);
+  level.Operator(o_CK,o_VK);
 
   // alpha1=ck*rhsC, rho1=ck*Ack, norm_rhs=sqrt(rhsC*rhsC)
-  dfloat rho[3];
   if(ktype == PCG)
-    kcycleCombinedOp1(level, rho, o_CK, o_RHS, o_VK);
+    kcycleCombinedOp1(level, o_CK, o_RHS, o_VK, alpha1, rho1, norm_rhs);
 
   if(ktype == GMRES)
-    kcycleCombinedOp1(level, rho, o_VK, o_RHS, o_VK);
+    kcycleCombinedOp1(level, o_VK, o_RHS, o_VK, alpha1, rho1, norm_rhs);
 
-  *alpha1 = rho[0];
-  *rho1   = rho[1];
-  *norm_rhs = sqrt(rho[2]);
+  norm_rhs = sqrt(norm_rhs);
 
   // rhs = rhs - (alpha1/rho1)*vk
-  const dfloat a = -(*alpha1)/(*rho1);
-  *norm_rhstilde = sqrt(vectorAddInnerProd(level, a, o_VK, 1.0, o_RHS));
+  const dfloat a = -(alpha1)/(rho1);
+  norm_rhstilde = sqrt(vectorAddInnerProd(level, a, o_VK, 1.0, o_RHS));
 }
 
-void multigrid_t::kcycleOp2(multigridLevel* level,
-                            occa::memory& o_X,  occa::memory& o_RHS,
-                            occa::memory& o_CK, occa::memory& o_VK, occa::memory& o_WK,
+void multigrid_t::kcycleOp2(multigridLevel& level,
+                            deviceMemory<dfloat>& o_X,  deviceMemory<dfloat>& o_RHS,
+                            deviceMemory<dfloat>& o_CK, deviceMemory<dfloat>& o_VK, deviceMemory<dfloat>& o_WK,
                             const dfloat alpha1, const dfloat rho1) {
 
-  if(fabs(rho1) > (dfloat) 1e-20){
+  if(std::abs(rho1) > (dfloat) 1e-20){
     // wk = A*x
-    level->Operator(o_X,o_WK);
+    level.Operator(o_X,o_WK);
 
     // gamma=xC*Ack, beta=xC*AxC, alpha2=xC*rhsC
-    dfloat rho[3];
+    dfloat gamma, beta, alpha2;
 
     if(ktype == PCG)
-      kcycleCombinedOp2(level, rho, o_X, o_VK, o_WK, o_RHS);
+      kcycleCombinedOp2(level, o_X, o_VK, o_WK, o_RHS, gamma, beta, alpha2);
 
     if(ktype == GMRES)
-      kcycleCombinedOp2(level, rho, o_WK, o_VK, o_WK, o_RHS);
-
-    const dfloat gamma  = rho[0];
-    const dfloat beta   = rho[1];
-    const dfloat alpha2 = rho[2];
-
+      kcycleCombinedOp2(level, o_WK, o_VK, o_WK, o_RHS, gamma, beta, alpha2);
 
     const dfloat rho2 = beta - gamma*gamma/rho1;
 
-    if(fabs(rho2) > (dfloat) 1e-20){
+    if(std::abs(rho2) > (dfloat) 1e-20){
       // x = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ck + (alpha2/rho2)*x
       const dfloat a = alpha1/rho1 - gamma*alpha2/(rho1*rho2);
       const dfloat b = alpha2/rho2;
 
-      platform.linAlg.axpy(level->Nrows, a, o_CK, b, o_X);
+      platform.linAlg().axpy(level.Nrows, a, o_CK, b, o_X);
     }
   }
 }
 
-// returns aDotbc[0] = a\dot b, aDotbc[1] = a\dot c, aDotbc[2] = b\dot b,
+// returns aDotb = a\dot b, aDotc = a\dot c, bDotb = b\dot b (summed over comm)
-void multigrid_t::kcycleCombinedOp1(multigridLevel* level,
-                                    dfloat *aDotbc, occa::memory& o_a,
-                                    occa::memory& o_b, occa::memory& o_c) {
+void multigrid_t::kcycleCombinedOp1(multigridLevel& level,
+                                    deviceMemory<dfloat>& o_a,
+                                    deviceMemory<dfloat>& o_b,
+                                    deviceMemory<dfloat>& o_c,
+                                    dfloat& aDotb,
+                                    dfloat& aDotc,
+                                    dfloat& bDotb) {
 
-  const dlong N = level->Nrows;
-  dfloat result[3] = {0.,0.,0.};
-  dlong numBlocks = (N < PARALMOND_NBLOCKS) ? N : PARALMOND_NBLOCKS;
+  const dlong N = level.Nrows;
+  dlong numBlocks = std::min(N, PARALMOND_NBLOCKS);
 
   kcycleCombinedOp1Kernel(numBlocks,N,o_a,o_b,o_c,o_reductionScratch);
-  o_reductionScratch.copyTo(reductionScratch,3*numBlocks*sizeof(dfloat),0);
 
-  for(dlong i=0; i<numBlocks; i++) {
-    result[0] += ((dfloat*)reductionScratch)[3*i+0];
-    result[1] += ((dfloat*)reductionScratch)[3*i+1];
-    result[2] += ((dfloat*)reductionScratch)[3*i+2];
+  if (numBlocks>0) {
+    reductionScratch.copyFrom(o_reductionScratch,3*numBlocks);
+  } else {
+    reductionScratch[0] = 0.0;
+    reductionScratch[1] = 0.0;
+    reductionScratch[2] = 0.0;
   }
-  MPI_Allreduce(result,aDotbc,3,MPI_DFLOAT,MPI_SUM,comm);
+
+  for(dlong i=1; i<numBlocks; i++) {
+    reductionScratch[0] += reductionScratch[3*i+0];
+    reductionScratch[1] += reductionScratch[3*i+1];
+    reductionScratch[2] += reductionScratch[3*i+2];
+  }
+  comm.Allreduce(reductionScratch, Comm::Sum, 3);
+  aDotb = reductionScratch[0];
+  aDotc = reductionScratch[1];
+  bDotb = reductionScratch[2];
 }
 
-// returns aDotbcd[0] = a\dot b, aDotbcd[1] = a\dot c, aDotbcd[2] = a\dot d,
+// returns aDotb = a\dot b, aDotc = a\dot c, aDotd = a\dot d (summed over comm)
-void multigrid_t::kcycleCombinedOp2(multigridLevel* level, dfloat *aDotbcd,
-                                    occa::memory& o_a, occa::memory& o_b,
-                                    occa::memory& o_c, occa::memory& o_d) {
-
-  const dlong N = level->Nrows;
-  dfloat result[3] = {0.,0.,0.};
-  dlong numBlocks = (N < PARALMOND_NBLOCKS) ? N : PARALMOND_NBLOCKS;
+void multigrid_t::kcycleCombinedOp2(multigridLevel& level,
+                                    deviceMemory<dfloat>& o_a,
+                                    deviceMemory<dfloat>& o_b,
+                                    deviceMemory<dfloat>& o_c,
+                                    deviceMemory<dfloat>& o_d,
+                                    dfloat& aDotb,
+                                    dfloat& aDotc,
+                                    dfloat& aDotd) {
+
+  const dlong N = level.Nrows;
+  dlong numBlocks = std::min(N, PARALMOND_NBLOCKS);
 
   kcycleCombinedOp2Kernel(numBlocks,N,o_a,o_b,o_c,o_d,o_reductionScratch);
-  o_reductionScratch.copyTo(reductionScratch,3*numBlocks*sizeof(dfloat),0);
 
-  for(dlong i=0; i<numBlocks; i++) {
-    result[0] += ((dfloat*)reductionScratch)[3*i+0];
-    result[1] += ((dfloat*)reductionScratch)[3*i+1];
-    result[2] += ((dfloat*)reductionScratch)[3*i+2];
+  if (numBlocks>0) {
+    reductionScratch.copyFrom(o_reductionScratch,3*numBlocks);
+  } else {
+    reductionScratch[0] = 0.0;
+    reductionScratch[1] = 0.0;
+    reductionScratch[2] = 0.0;
   }
-  MPI_Allreduce(result,aDotbcd,3,MPI_DFLOAT,MPI_SUM,comm);
+
+  for(dlong i=1; i<numBlocks; i++) {
+    reductionScratch[0] += reductionScratch[3*i+0];
+    reductionScratch[1] += reductionScratch[3*i+1];
+    reductionScratch[2] += reductionScratch[3*i+2];
+  }
+  comm.Allreduce(reductionScratch, Comm::Sum, 3);
+  aDotb = reductionScratch[0];
+  aDotc = reductionScratch[1];
+  aDotd = reductionScratch[2];
 }
 
 // y = beta*y + alpha*x, and return y\dot y
-dfloat multigrid_t::vectorAddInnerProd(multigridLevel* level,
-                                      const dfloat alpha, occa::memory& o_X,
-                                      const dfloat beta,  occa::memory& o_Y){
+dfloat multigrid_t::vectorAddInnerProd(multigridLevel& level,
+                                      const dfloat alpha, deviceMemory<dfloat>& o_X,
+                                      const dfloat beta,  deviceMemory<dfloat>& o_Y){
 
-  const dlong N = level->Nrows;
-  dfloat result = 0.;
-  dfloat gresult = 0.;
-  dlong numBlocks = (N < PARALMOND_NBLOCKS) ? N : PARALMOND_NBLOCKS;
+  const dlong N = level.Nrows;
+  dlong numBlocks = std::min(N, PARALMOND_NBLOCKS);
 
   vectorAddInnerProdKernel(numBlocks,N,alpha,beta,o_X,o_Y,o_reductionScratch);
-  o_reductionScratch.copyTo(reductionScratch,numBlocks*sizeof(dfloat),0);
+
+  if (numBlocks>0) {
+    reductionScratch.copyFrom(o_reductionScratch,numBlocks);
+  } else {
+    reductionScratch[0] = 0.0;
+  }
 
   // #pragma omp parallel for reduction(+:result)
-  for (dlong i=0; i<numBlocks; i++) {
-    result += ((dfloat*)reductionScratch)[i];
+  for (dlong i=1; i<numBlocks; i++) {
+    reductionScratch[0] += reductionScratch[i];
   }
-  MPI_Allreduce(&result,&gresult,1,MPI_DFLOAT,MPI_SUM,comm);
-  return gresult;
+  comm.Allreduce(reductionScratch, Comm::Sum, 1);
+  return reductionScratch[0];
 }
 
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondKernels.cpp b/libs/parAlmond/parAlmondKernels.cpp
index f71e68d75..81e58113a 100644
--- a/libs/parAlmond/parAlmondKernels.cpp
+++ b/libs/parAlmond/parAlmondKernels.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,96 +25,69 @@ SOFTWARE.
 */
 
 #include "parAlmond.hpp"
+#include "parAlmond/parAlmondKernels.hpp"
 
-namespace parAlmond {
-
-int Nrefs = 0;
-
-//NC: Hard code these for now. Should be sufficient for GPU devices, but needs attention for CPU
-const int blockSize = 256;
-int NonzerosPerBlock = 2048; //should be a multiple of blockSize for good unrolling
+namespace libp {
 
-occa::kernel SpMVcsrKernel1;
-occa::kernel SpMVcsrKernel2;
-occa::kernel SpMVmcsrKernel;
+namespace parAlmond {
 
-occa::kernel SmoothJacobiCSRKernel;
-occa::kernel SmoothJacobiMCSRKernel;
+kernel_t SpMVcsrKernel1;
+kernel_t SpMVcsrKernel2;
+kernel_t SpMVmcsrKernel;
 
-occa::kernel SmoothChebyshevStartKernel;
-occa::kernel SmoothChebyshevCSRKernel;
-occa::kernel SmoothChebyshevMCSRKernel;
-occa::kernel SmoothChebyshevUpdateKernel;
+kernel_t SmoothJacobiCSRKernel;
+kernel_t SmoothJacobiMCSRKernel;
 
-occa::kernel kcycleCombinedOp1Kernel;
-occa::kernel kcycleCombinedOp2Kernel;
-occa::kernel vectorAddInnerProdKernel;
+kernel_t SmoothChebyshevStartKernel;
+kernel_t SmoothChebyshevCSRKernel;
+kernel_t SmoothChebyshevMCSRKernel;
+kernel_t SmoothChebyshevUpdateKernel;
 
-occa::kernel dGEMVKernel;
+kernel_t kcycleCombinedOp1Kernel;
+kernel_t kcycleCombinedOp2Kernel;
+kernel_t vectorAddInnerProdKernel;
 
-MPI_Datatype MPI_NONZERO_T;
+kernel_t dGEMVKernel;
 
 void buildParAlmondKernels(platform_t& platform){
 
-  // Make the MPI_NONZERO_T data type
-  parCOO::nonZero_t NZ;
-  MPI_Datatype dtype[3] = {MPI_HLONG, MPI_HLONG, MPI_DFLOAT};
-  int blength[3] = {1, 1, 1};
-  MPI_Aint addr[3], displ[3];
-  MPI_Get_address ( &(NZ.row), addr+0);
-  MPI_Get_address ( &(NZ.col), addr+1);
-  MPI_Get_address ( &(NZ.val), addr+2);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  MPI_Type_create_struct (3, blength, displ, dtype, &MPI_NONZERO_T);
-  MPI_Type_commit (&MPI_NONZERO_T);
-
-  //seed rng
-  int rank=platform.rank;
-  double seed = (double) rank;
-  srand48(seed);
+  if (SpMVcsrKernel1.isInitialized()==false) {
+    //seed rng
+    int rank=platform.rank();
+    double seed = (double) rank;
+    srand48(seed);
 
-  //build kernels
-  occa::properties kernelInfo = platform.props;
+    //build kernels
+    properties_t kernelInfo = platform.props();
 
-  kernelInfo["defines/" "p_BLOCKSIZE"]= blockSize;
-  kernelInfo["defines/" "p_NonzerosPerBlock"]= NonzerosPerBlock;
+    kernelInfo["defines/" "p_BLOCKSIZE"]= blockSize;
+    kernelInfo["defines/" "p_NonzerosPerBlock"]= NonzerosPerBlock;
 
-  if (rank==0) {printf("Compiling parALMOND Kernels...");fflush(stdout);}
+    if (rank==0) {printf("Compiling parALMOND Kernels...");fflush(stdout);}
 
-  SpMVcsrKernel1  = platform.buildKernel(PARALMOND_DIR"/okl/SpMVcsr.okl",  "SpMVcsr1",  kernelInfo);
-  SpMVcsrKernel2  = platform.buildKernel(PARALMOND_DIR"/okl/SpMVcsr.okl",  "SpMVcsr2",  kernelInfo);
-  SpMVmcsrKernel  = platform.buildKernel(PARALMOND_DIR"/okl/SpMVmcsr.okl", "SpMVmcsr1", kernelInfo);
+    SpMVcsrKernel1  = platform.buildKernel(PARALMOND_DIR"/okl/SpMVcsr.okl",  "SpMVcsr1",  kernelInfo);
+    SpMVcsrKernel2  = platform.buildKernel(PARALMOND_DIR"/okl/SpMVcsr.okl",  "SpMVcsr2",  kernelInfo);
+    SpMVmcsrKernel  = platform.buildKernel(PARALMOND_DIR"/okl/SpMVmcsr.okl", "SpMVmcsr1", kernelInfo);
 
-  SmoothJacobiCSRKernel  = platform.buildKernel(PARALMOND_DIR"/okl/SmoothJacobi.okl", "SmoothJacobiCSR", kernelInfo);
-  SmoothJacobiMCSRKernel = platform.buildKernel(PARALMOND_DIR"/okl/SmoothJacobi.okl", "SmoothJacobiMCSR", kernelInfo);
+    SmoothJacobiCSRKernel  = platform.buildKernel(PARALMOND_DIR"/okl/SmoothJacobi.okl", "SmoothJacobiCSR", kernelInfo);
+    SmoothJacobiMCSRKernel = platform.buildKernel(PARALMOND_DIR"/okl/SmoothJacobi.okl", "SmoothJacobiMCSR", kernelInfo);
 
-  SmoothChebyshevStartKernel = platform.buildKernel(PARALMOND_DIR"/okl/SmoothChebyshev.okl", "SmoothChebyshevStart", kernelInfo);
-  SmoothChebyshevCSRKernel  = platform.buildKernel(PARALMOND_DIR"/okl/SmoothChebyshev.okl", "SmoothChebyshevCSR", kernelInfo);
-  SmoothChebyshevMCSRKernel = platform.buildKernel(PARALMOND_DIR"/okl/SmoothChebyshev.okl", "SmoothChebyshevMCSR", kernelInfo);
-  SmoothChebyshevUpdateKernel = platform.buildKernel(PARALMOND_DIR"/okl/SmoothChebyshev.okl", "SmoothChebyshevUpdate", kernelInfo);
+    SmoothChebyshevStartKernel = platform.buildKernel(PARALMOND_DIR"/okl/SmoothChebyshev.okl", "SmoothChebyshevStart", kernelInfo);
+    SmoothChebyshevCSRKernel  = platform.buildKernel(PARALMOND_DIR"/okl/SmoothChebyshev.okl", "SmoothChebyshevCSR", kernelInfo);
+    SmoothChebyshevMCSRKernel = platform.buildKernel(PARALMOND_DIR"/okl/SmoothChebyshev.okl", "SmoothChebyshevMCSR", kernelInfo);
+    SmoothChebyshevUpdateKernel = platform.buildKernel(PARALMOND_DIR"/okl/SmoothChebyshev.okl", "SmoothChebyshevUpdate", kernelInfo);
 
-  vectorAddInnerProdKernel = platform.buildKernel(PARALMOND_DIR"/okl/vectorAddInnerProd.okl", "vectorAddInnerProd", kernelInfo);
+    vectorAddInnerProdKernel = platform.buildKernel(PARALMOND_DIR"/okl/vectorAddInnerProd.okl", "vectorAddInnerProd", kernelInfo);
 
-  kcycleCombinedOp1Kernel = platform.buildKernel(PARALMOND_DIR"/okl/kcycleCombinedOp.okl", "kcycleCombinedOp1", kernelInfo);
-  kcycleCombinedOp2Kernel = platform.buildKernel(PARALMOND_DIR"/okl/kcycleCombinedOp.okl", "kcycleCombinedOp2", kernelInfo);
+    kcycleCombinedOp1Kernel = platform.buildKernel(PARALMOND_DIR"/okl/kcycleCombinedOp.okl", "kcycleCombinedOp1", kernelInfo);
+    kcycleCombinedOp2Kernel = platform.buildKernel(PARALMOND_DIR"/okl/kcycleCombinedOp.okl", "kcycleCombinedOp2", kernelInfo);
 
-  dGEMVKernel = platform.buildKernel(PARALMOND_DIR"/okl/dGEMV.okl", "dGEMV", kernelInfo);
+    dGEMVKernel = platform.buildKernel(PARALMOND_DIR"/okl/dGEMV.okl", "dGEMV", kernelInfo);
 
-  if(rank==0) printf("done.\n");
+    if(rank==0) printf("done.\n");
+  }
 }
 
-void freeParAlmondKernels() {
-
-  SpMVcsrKernel1.free();
-  SpMVcsrKernel2.free();
-  SpMVmcsrKernel.free();
-
-  kcycleCombinedOp1Kernel.free();
-  kcycleCombinedOp2Kernel.free();
-  vectorAddInnerProdKernel.free();
-}
-
-
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondMultigrid.cpp b/libs/parAlmond/parAlmondMultigrid.cpp
index 46ec3a0c8..22519fad9 100644
--- a/libs/parAlmond/parAlmondMultigrid.cpp
+++ b/libs/parAlmond/parAlmondMultigrid.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,12 +26,13 @@ SOFTWARE.
 
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondDefines.hpp"
-#include "parAlmond/parAlmondMultigrid.hpp"
 #include "parAlmond/parAlmondCoarseSolver.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-void multigrid_t::Operator(occa::memory& o_RHS, occa::memory& o_X) {
+void multigrid_t::Operator(deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_X) {
   if (ctype == KCYCLE) {
     kcycle(0, o_RHS, o_X);
   } else {
@@ -40,7 +41,7 @@ void multigrid_t::Operator(occa::memory& o_RHS, occa::memory& o_X) {
 }
 
 multigrid_t::multigrid_t(platform_t& _platform, settings_t& _settings,
-                         MPI_Comm _comm):
+                         comm_t _comm):
     platform(_platform), settings(_settings), comm(_comm) {
 
   //determine what sort of multigrid cycle to construct
@@ -80,70 +81,64 @@ multigrid_t::multigrid_t(platform_t& _platform, settings_t& _settings,
   coarsetype=COARSEEXACT;
 
   if (coarsetype==COARSEEXACT) {
-    coarseSolver = new exactSolver_t(_platform, _settings, _comm);
+    coarseSolver = std::make_shared<exactSolver_t>(_platform, _settings, _comm);
   } else {
-    coarseSolver = new oasSolver_t(_platform, _settings, _comm);
+    coarseSolver = std::make_shared<oasSolver_t>(_platform, _settings, _comm);
   }
 }
 
-multigrid_t::~multigrid_t() {
-  if (linearSolver) delete linearSolver;
-  if (coarseSolver) delete coarseSolver;
-  for (int n=0;n<numLevels;n++) delete levels[n];
-}
+void multigrid_t::AllocateLevelWorkSpace(const int k){
 
-void multigrid_t::AddLevel(multigridLevel* level){
+  multigridLevel& level = *levels[k];
 
-  //If using an exact solver and this is the first level, setup a linearSovler
+  //If using an exact solver and this is the first level, set up a linearSolver
-  if (exact && numLevels==0) {
+  if (exact && k==0) {
     if (settings.compareSetting("PARALMOND CYCLE", "NONSYM"))
-      linearSolver = new pgmres(level->Nrows, level->Ncols - level->Nrows,
-                             platform, settings, comm);
+      linearSolver.Setup<LinearSolver::pgmres>(level.Nrows,
+                                               level.Ncols - level.Nrows,
+                                               platform, settings, comm);
     else
-      linearSolver = new pcg(level->Nrows, level->Ncols - level->Nrows,
-                             platform, settings, comm);
+      linearSolver.Setup<LinearSolver::pcg>(level.Nrows,
+                                            level.Ncols - level.Nrows,
+                                            platform, settings, comm);
   }
 
   if (ctype==KCYCLE) {
     //first level
-    if (reductionScratchBytes==0) {
-      reductionScratchBytes = 3*PARALMOND_NBLOCKS*sizeof(dfloat);
-      dfloat *dummy = (dfloat *) calloc(3*PARALMOND_NBLOCKS,sizeof(dfloat));
-      o_reductionScratch = platform.malloc(reductionScratchBytes, dummy);
-      reductionScratch = platform.hostMalloc(reductionScratchBytes, NULL, h_reductionScratch);
-      free(dummy);
+    if (NreductionScratch==0) {
+      NreductionScratch = 3*PARALMOND_NBLOCKS;
+      memory<dfloat> dummy(3*PARALMOND_NBLOCKS, 0.0);
+      reductionScratch = platform.hostMalloc<dfloat>(NreductionScratch, dummy);
+      o_reductionScratch = platform.malloc<dfloat>(NreductionScratch, dummy);
     }
 
-    //extra stroage for kcycle vectors
+    //extra storage for kcycle vectors
-    if (numLevels>0 && numLevels<NUMKCYCLES+1) {
-      dfloat *dummy = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
-      o_ck[numLevels] = platform.malloc(level->Ncols*sizeof(dfloat),dummy);
-      o_vk[numLevels] = platform.malloc(level->Nrows*sizeof(dfloat),dummy);
-      o_wk[numLevels] = platform.malloc(level->Nrows*sizeof(dfloat),dummy);
-      free(dummy);
+    if (k>0 && k<NUMKCYCLES+1) {
+      memory<dfloat> dummy(level.Ncols,0.0);
+      o_ck[k] = platform.malloc<dfloat>(level.Ncols,dummy);
+      o_vk[k] = platform.malloc<dfloat>(level.Nrows,dummy);
+      o_wk[k] = platform.malloc<dfloat>(level.Nrows,dummy);
     }
   }
 
   //allocate space for coarse rhs and x
-  if (numLevels>0) {
-    dfloat *dummy = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
-    o_x[numLevels]   = platform.malloc(level->Ncols*sizeof(dfloat),dummy);
-    o_rhs[numLevels] = platform.malloc(level->Ncols*sizeof(dfloat),dummy);
-    free(dummy);
+  if (k>0) {
+    memory<dfloat> dummy(level.Ncols,0.0);
+    o_x[k]   = platform.malloc<dfloat>(level.Ncols,dummy);
+    o_rhs[k] = platform.malloc<dfloat>(level.Ncols,dummy);
   }
 
   //scratch space includes space for residual and 2 vectors used in Chebyshev smoothing
-  size_t requiredBytes = 2*level->Ncols*sizeof(dfloat);
-  if (requiredBytes>scratchSpaceBytes) {
-    scratchSpaceBytes = requiredBytes;
-    dfloat *dummy = (dfloat *) calloc(2*level->Ncols,sizeof(dfloat));
-    o_scratch = platform.malloc(requiredBytes, dummy);
-    free(dummy);
+  size_t Nrequired = 2*level.Ncols;
+  if (Nrequired>NscratchSpace) {
+    NscratchSpace = Nrequired;
+    memory<dfloat> dummy(2*level.Ncols,0.0);
+    o_scratch = platform.malloc<dfloat>(Nrequired, dummy);
   }
 
-  level->o_scratch = o_scratch;
-
-  levels[numLevels++] = level;
+  level.o_scratch = o_scratch;
 }
 
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondSettings.cpp b/libs/parAlmond/parAlmondSettings.cpp
index d95349cda..e8dda1d63 100644
--- a/libs/parAlmond/parAlmondSettings.cpp
+++ b/libs/parAlmond/parAlmondSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,10 +26,12 @@ SOFTWARE.
 
 #include "parAlmond.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
 void AddSettings(settings_t& settings,
-                          const string prefix) {
+                 const std::string prefix) {
 
   settings.newSetting(prefix+"PARALMOND CYCLE",
                       "VCYCLE",
@@ -67,4 +69,6 @@ void ReportSettings(settings_t& settings) {
     settings.reportSetting("PARALMOND CHEBYSHEV DEGREE");
 }
 
-} //namespace parAlmond
\ No newline at end of file
+} //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondSmoothPrologator.cpp b/libs/parAlmond/parAlmondSmoothPrologator.cpp
index 80263560a..ee8563563 100644
--- a/libs/parAlmond/parAlmondSmoothPrologator.cpp
+++ b/libs/parAlmond/parAlmondSmoothPrologator.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,14 +27,15 @@ SOFTWARE.
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondAMGSetup.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-parCSR *smoothProlongator(parCSR *A, parCSR *T){
+parCSR smoothProlongator(parCSR& A, parCSR& T){
 
   // MPI info
-  int rank, size;
-  MPI_Comm_rank(A->comm, &rank);
-  MPI_Comm_size(A->comm, &size);
+  int rank = A.comm.rank();
+  int size = A.comm.size();
 
-  // This function computes a smoothed prologation operator
+  // This function computes a smoothed prolongation operator
   // via a single weighted Jacobi iteration on the tentative
@@ -44,81 +45,82 @@ parCSR *smoothProlongator(parCSR *A, parCSR *T){
   //
   // To compute D^{-1}*A*T we need all the rows T(j,:) for which
   // j is a column index for the nonzeros of A on this rank.
-  // For all local column indices in A->diag, we will already
+  // For all local column indices in A.diag, we will already
   // have the row of T on this rank, so we just need to gather
   // the offd colIds
 
   //Jacobi weight
-  const dfloat omega = (4./3.)/A->rho;
+  const dfloat omega = (4./3.)/A.rho;
 
-  hlong *recvRows = (hlong *) calloc(A->Ncols-A->NlocalCols, sizeof(hlong));
-  int *sendCounts = (int*) calloc(size, sizeof(int));
-  int *recvCounts = (int*) calloc(size, sizeof(int));
-  int *sendOffsets = (int*) calloc(size+1, sizeof(int));
-  int *recvOffsets = (int*) calloc(size+1, sizeof(int));
+  memory<hlong> recvRows(A.Ncols-A.NlocalCols);
+  memory<int> sendCounts(size);
+  memory<int> recvCounts(size, 0);
+  memory<int> sendOffsets(size+1);
+  memory<int> recvOffsets(size+1);
 
   //use the colMap of A to list the needed rows of T
   int r=0;
-  for (dlong n=A->NlocalCols;n<A->Ncols;n++) {
-    const hlong id = A->colMap[n];
-    while (id>=T->globalRowStarts[r+1]) r++; //assumes the halo is sorted
+  for (dlong n=A.NlocalCols;n<A.Ncols;n++) {
+    const hlong id = A.colMap[n];
+    while (id>=T.globalRowStarts[r+1]) r++; //assumes the halo is sorted
     recvCounts[r]++;
-    recvRows[n-A->NlocalCols] = id; //record the row to recv
+    recvRows[n-A.NlocalCols] = id; //record the row to recv
   }
 
   //share the counts
-  MPI_Alltoall(recvCounts, 1, MPI_INT,
-               sendCounts, 1, MPI_INT, A->comm);
+  A.comm.Alltoall(recvCounts, sendCounts);
 
+  sendOffsets[0] = 0;
+  recvOffsets[0] = 0;
   for (r=0;r<size;r++) {
     sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
     recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
   }
 
   int sendTotal = sendOffsets[size];
-  hlong *sendRows = (hlong *) calloc(sendTotal, sizeof(hlong));
+  memory<hlong> sendRows(sendTotal);
 
   //share the rowIds
-  MPI_Alltoallv(recvRows, recvCounts, recvOffsets, MPI_HLONG,
-                sendRows, sendCounts, sendOffsets, MPI_HLONG,
-                T->comm);
+  T.comm.Alltoallv(recvRows, recvCounts, recvOffsets,
+                   sendRows, sendCounts, sendOffsets);
 
   //we now have a list of rows to send, count the nnz to send
   dlong nnzTotal=0;
   for (r=0;r<size;r++) {
     sendCounts[r] =0; //reset
     for (int n=sendOffsets[r];n<sendOffsets[r+1];n++) {
-      dlong i = (dlong) (sendRows[n]-T->globalRowStarts[rank]); //local row id
-      sendCounts[r]+= T->diag.rowStarts[i+1]-T->diag.rowStarts[i]; //count entries in this row
-      sendCounts[r]+= T->offd.rowStarts[i+1]-T->offd.rowStarts[i]; //count entries in this row
+      dlong i = (dlong) (sendRows[n]-T.globalRowStarts[rank]); //local row id
+      sendCounts[r]+= T.diag.rowStarts[i+1]-T.diag.rowStarts[i]; //count entries in this row
+      sendCounts[r]+= T.offd.rowStarts[i+1]-T.offd.rowStarts[i]; //count entries in this row
     }
     nnzTotal += sendCounts[r]; //tally the total
   }
 
-  parCOO::nonZero_t *sendNonZeros = (parCOO::nonZero_t *) calloc(nnzTotal, sizeof(parCOO::nonZero_t));
+  memory<parCOO::nonZero_t> sendNonZeros(nnzTotal);
 
   nnzTotal=0; //reset
   for (r=0;r<size;r++) {
     for (int n=sendOffsets[r];n<sendOffsets[r+1];n++) {
-      dlong i = (dlong) (sendRows[n] - T->globalRowStarts[rank]); //local row id
-      for (dlong jj=T->diag.rowStarts[i]; jj<T->diag.rowStarts[i+1];jj++){
+      dlong i = (dlong) (sendRows[n] - T.globalRowStarts[rank]); //local row id
+      for (dlong jj=T.diag.rowStarts[i]; jj<T.diag.rowStarts[i+1];jj++){
         sendNonZeros[nnzTotal].row = sendRows[n];
-        sendNonZeros[nnzTotal].col = T->diag.cols[jj] + T->globalColStarts[rank];
-        sendNonZeros[nnzTotal].val = T->diag.vals[jj];
+        sendNonZeros[nnzTotal].col = T.diag.cols[jj] + T.globalColStarts[rank];
+        sendNonZeros[nnzTotal].val = T.diag.vals[jj];
         nnzTotal++;
       }
-      for (dlong jj=T->offd.rowStarts[i]; jj<T->offd.rowStarts[i+1];jj++){
+      for (dlong jj=T.offd.rowStarts[i]; jj<T.offd.rowStarts[i+1];jj++){
         sendNonZeros[nnzTotal].row = sendRows[n];
-        sendNonZeros[nnzTotal].col = T->colMap[T->offd.cols[jj]];
-        sendNonZeros[nnzTotal].val = T->offd.vals[jj];
+        sendNonZeros[nnzTotal].col = T.colMap[T.offd.cols[jj]];
+        sendNonZeros[nnzTotal].val = T.offd.vals[jj];
         nnzTotal++;
       }
     }
   }
 
-  MPI_Alltoall(sendCounts, 1, MPI_INT,
-               recvCounts, 1, MPI_INT, A->comm);
+  A.comm.Alltoall(sendCounts, recvCounts);
 
+  sendOffsets[0] = 0;
+  recvOffsets[0] = 0;
   for (r=0;r<size;r++) {
     sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
     recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
@@ -126,37 +128,27 @@ parCSR *smoothProlongator(parCSR *A, parCSR *T){
 
 
   dlong Toffdnnz = recvOffsets[size]; //total nonzeros
-  parCOO::nonZero_t *ToffdRows = (parCOO::nonZero_t *)
-                                 calloc(Toffdnnz, sizeof(parCOO::nonZero_t));
-
-  MPI_Alltoallv(sendNonZeros, sendCounts, sendOffsets, MPI_NONZERO_T,
-                ToffdRows, recvCounts, recvOffsets, MPI_NONZERO_T,
-                T->comm);
+  memory<parCOO::nonZero_t> ToffdRows(Toffdnnz);
 
-  //clean up
-  MPI_Barrier(T->comm);
-  free(sendNonZeros);
-  free(sendCounts);
-  free(recvCounts);
-  free(sendOffsets);
-  free(recvOffsets);
+  T.comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets,
+                   ToffdRows, recvCounts, recvOffsets);
 
   //we now have all the needed nonlocal rows (should also be sorted by row then col)
 
   //make an array of row offsets so we know how large each row is
-  dlong *ToffdRowOffsets = (dlong *) calloc(A->Ncols-A->NlocalCols+1, sizeof(dlong));
+  memory<dlong> ToffdRowOffsets(A.Ncols-A.NlocalCols+1, 0);
 
   dlong id=0;
   for (dlong n=0;n<Toffdnnz;n++) {
     hlong row = ToffdRows[n].row;
 
-    while(A->colMap[id+A->NlocalCols]!=row) id++;
+    while(A.colMap[id+A.NlocalCols]!=row) id++;
 
     ToffdRowOffsets[id+1]++; //count entry in row
   }
 
   //cumulative sum
-  for (dlong n=0;n<A->Ncols-A->NlocalCols;n++)
+  for (dlong n=0;n<A.Ncols-A.NlocalCols;n++)
     ToffdRowOffsets[n+1] += ToffdRowOffsets[n];
 
 
@@ -165,110 +157,106 @@ parCSR *smoothProlongator(parCSR *A, parCSR *T){
   // the entries
 
   // Find how big the intermediate form is
-  nnzTotal = T->diag.nnz+T->offd.nnz; //start with T populated
+  nnzTotal = T.diag.nnz+T.offd.nnz; //start with T populated
 
-  for (dlong i=0;i<A->Nrows;i++) {
+  for (dlong i=0;i<A.Nrows;i++) {
     //local entries
-    dlong start = A->diag.rowStarts[i];
-    dlong end   = A->diag.rowStarts[i+1];
+    dlong start = A.diag.rowStarts[i];
+    dlong end   = A.diag.rowStarts[i+1];
     for (dlong j=start;j<end;j++) {
-      const dlong col = A->diag.cols[j];
-      const int nnzBj =  T->diag.rowStarts[col+1]-T->diag.rowStarts[col]
-                        +T->offd.rowStarts[col+1]-T->offd.rowStarts[col];
+      const dlong col = A.diag.cols[j];
+      const int nnzBj =  T.diag.rowStarts[col+1]-T.diag.rowStarts[col]
+                        +T.offd.rowStarts[col+1]-T.offd.rowStarts[col];
       nnzTotal += nnzBj;
     }
     //non-local entries
-    start = A->offd.rowStarts[i];
-    end   = A->offd.rowStarts[i+1];
+    start = A.offd.rowStarts[i];
+    end   = A.offd.rowStarts[i+1];
     for (dlong j=start;j<end;j++) {
-      const dlong col = A->offd.cols[j]-A->NlocalCols;
+      const dlong col = A.offd.cols[j]-A.NlocalCols;
       const int nnzBj = ToffdRowOffsets[col+1] - ToffdRowOffsets[col];
       nnzTotal += nnzBj;
     }
   }
 
-  parCOO::nonZero_t *Ptmp = (parCOO::nonZero_t *)
-                            calloc(nnzTotal, sizeof(parCOO::nonZero_t));
+  memory<parCOO::nonZero_t> Ptmp(nnzTotal);
 
   // Fill the intermediate form of P
   dlong cnt = 0;
   //First P = T
-  for (dlong i=0;i<T->Nrows;i++) {
+  for (dlong i=0;i<T.Nrows;i++) {
     //local T entries
-    dlong start = T->diag.rowStarts[i];
-    dlong end   = T->diag.rowStarts[i+1];
+    dlong start = T.diag.rowStarts[i];
+    dlong end   = T.diag.rowStarts[i+1];
     for (dlong j=start;j<end;j++) {
-      Ptmp[cnt].row = i + T->globalRowStarts[rank];
-      Ptmp[cnt].col = T->diag.cols[j]+T->globalColStarts[rank]; //global id
-      Ptmp[cnt].val = T->diag.vals[j];
+      Ptmp[cnt].row = i + T.globalRowStarts[rank];
+      Ptmp[cnt].col = T.diag.cols[j]+T.globalColStarts[rank]; //global id
+      Ptmp[cnt].val = T.diag.vals[j];
       cnt++;
     }
     //non-local T entries
-    start = T->offd.rowStarts[i];
-    end   = T->offd.rowStarts[i+1];
+    start = T.offd.rowStarts[i];
+    end   = T.offd.rowStarts[i+1];
     for (dlong j=start;j<end;j++) {
-      Ptmp[cnt].row = i + T->globalRowStarts[rank];
-      Ptmp[cnt].col = T->colMap[T->offd.cols[j]];
-      Ptmp[cnt].val = T->offd.vals[j];
+      Ptmp[cnt].row = i + T.globalRowStarts[rank];
+      Ptmp[cnt].col = T.colMap[T.offd.cols[j]];
+      Ptmp[cnt].val = T.offd.vals[j];
       cnt++;
     }
   }
 
   //Then P -= omega*invD*A*T
-  for (dlong i=0;i<A->Nrows;i++) {
+  for (dlong i=0;i<A.Nrows;i++) {
     //local A entries
-    dlong start = A->diag.rowStarts[i];
-    dlong end   = A->diag.rowStarts[i+1];
+    dlong start = A.diag.rowStarts[i];
+    dlong end   = A.diag.rowStarts[i+1];
 
-    const dfloat invDi = 1.0/A->diagA[i];
+    const dfloat invDi = 1.0/A.diagA[i];
 
     for (dlong j=start;j<end;j++) {
-      const dlong col = A->diag.cols[j];
-      const dfloat Aval = -omega*invDi*A->diag.vals[j];
+      const dlong col = A.diag.cols[j];
+      const dfloat Aval = -omega*invDi*A.diag.vals[j];
 
       //local T entries
-      dlong Tstart = T->diag.rowStarts[col];
-      dlong Tend   = T->diag.rowStarts[col+1];
+      dlong Tstart = T.diag.rowStarts[col];
+      dlong Tend   = T.diag.rowStarts[col+1];
       for (dlong jj=Tstart;jj<Tend;jj++) {
-        Ptmp[cnt].row = i + A->globalRowStarts[rank];
-        Ptmp[cnt].col = T->diag.cols[jj]+T->globalColStarts[rank]; //global id
-        Ptmp[cnt].val = Aval*T->diag.vals[jj];
+        Ptmp[cnt].row = i + A.globalRowStarts[rank];
+        Ptmp[cnt].col = T.diag.cols[jj]+T.globalColStarts[rank]; //global id
+        Ptmp[cnt].val = Aval*T.diag.vals[jj];
         cnt++;
       }
       //non-local T entries
-      Tstart = T->offd.rowStarts[col];
-      Tend   = T->offd.rowStarts[col+1];
+      Tstart = T.offd.rowStarts[col];
+      Tend   = T.offd.rowStarts[col+1];
       for (dlong jj=Tstart;jj<Tend;jj++) {
-        Ptmp[cnt].row = i + A->globalRowStarts[rank];
-        Ptmp[cnt].col = T->colMap[T->offd.cols[jj]]; //global id
-        Ptmp[cnt].val = Aval*T->offd.vals[jj];
+        Ptmp[cnt].row = i + A.globalRowStarts[rank];
+        Ptmp[cnt].col = T.colMap[T.offd.cols[jj]]; //global id
+        Ptmp[cnt].val = Aval*T.offd.vals[jj];
         cnt++;
       }
     }
     //non-local A entries
-    start = A->offd.rowStarts[i];
-    end   = A->offd.rowStarts[i+1];
+    start = A.offd.rowStarts[i];
+    end   = A.offd.rowStarts[i+1];
     for (dlong j=start;j<end;j++) {
-      const dlong col = A->offd.cols[j]-A->NlocalCols;
-      const dfloat Aval = -omega*invDi*A->offd.vals[j];
+      const dlong col = A.offd.cols[j]-A.NlocalCols;
+      const dfloat Aval = -omega*invDi*A.offd.vals[j];
 
       // entries from recived rows of T
       dlong Tstart = ToffdRowOffsets[col];
       dlong Tend   = ToffdRowOffsets[col+1];
       for (dlong jj=Tstart;jj<Tend;jj++) {
-        Ptmp[cnt].row = i + A->globalRowStarts[rank];
+        Ptmp[cnt].row = i + A.globalRowStarts[rank];
         Ptmp[cnt].col = ToffdRows[jj].col; //global id
         Ptmp[cnt].val = Aval*ToffdRows[jj].val;
         cnt++;
       }
     }
   }
-  free(ToffdRowOffsets);
-  free(ToffdRows);
-
 
   //sort entries by the row and col
-  std::sort(Ptmp, Ptmp+nnzTotal,
+  std::sort(Ptmp.ptr(), Ptmp.ptr()+nnzTotal,
             [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& b) {
               if (a.row < b.row) return true;
               if (a.row > b.row) return false;
@@ -283,16 +271,14 @@ parCSR *smoothProlongator(parCSR *A, parCSR *T){
     if ((Ptmp[i].row!=Ptmp[i-1].row)||
         (Ptmp[i].col!=Ptmp[i-1].col)) nnz++;
 
-  parCOO cooP(A->platform, A->comm);
+  parCOO cooP(A.platform, A.comm);
 
   //copy global partition
-  cooP.globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  cooP.globalColStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  memcpy(cooP.globalRowStarts, A->globalRowStarts, (size+1)*sizeof(hlong));
-  memcpy(cooP.globalColStarts, T->globalColStarts, (size+1)*sizeof(hlong));
+  cooP.globalRowStarts = A.globalRowStarts;
+  cooP.globalColStarts = T.globalColStarts;
 
   cooP.nnz = nnz;
-  cooP.entries = (parCOO::nonZero_t *) calloc(nnz,sizeof(parCOO::nonZero_t));
+  cooP.entries.malloc(nnz);
 
   //compress nonzeros
   nnz = 0;
@@ -305,11 +291,11 @@ parCSR *smoothProlongator(parCSR *A, parCSR *T){
       cooP.entries[nnz-1].val += Ptmp[i].val;
     }
   }
-  //clean up
-  free(Ptmp);
 
   //build P from coo matrix
-  return new parCSR(cooP);
+  return parCSR(cooP);
 }
 
-} //namespace parAlmond
\ No newline at end of file
+} //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondSpMM.cpp b/libs/parAlmond/parAlmondSpMM.cpp
index 36d013232..7b67ee1af 100644
--- a/libs/parAlmond/parAlmondSpMM.cpp
+++ b/libs/parAlmond/parAlmondSpMM.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,88 +27,88 @@ SOFTWARE.
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondAMGSetup.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-parCSR *SpMM(parCSR *A, parCSR *B){
+parCSR SpMM(parCSR& A, parCSR& B){
 
   // MPI info
-  int rank, size;
-  MPI_Comm_rank(A->comm, &rank);
-  MPI_Comm_size(A->comm, &size);
+  int rank = A.comm.rank();
+  int size = A.comm.size();
 
   // To compute C = A*B we need all the rows B(j,:) for which
   // j is a column index for the nonzeros of A on this rank.
-  // For all local column indices in A->diag, we will already
+  // For all local column indices in A.diag, we will already
   // have the row of B on this rank, so we just need to gather
   // the offd colIds
 
-  hlong *recvRows = (hlong *) calloc(A->Ncols-A->NlocalCols, sizeof(hlong));
-  int *sendCounts = (int*) calloc(size, sizeof(int));
-  int *recvCounts = (int*) calloc(size, sizeof(int));
-  int *sendOffsets = (int*) calloc(size+1, sizeof(int));
-  int *recvOffsets = (int*) calloc(size+1, sizeof(int));
+  memory<hlong> recvRows(A.Ncols-A.NlocalCols);
+  memory<int> sendCounts(size);
+  memory<int> recvCounts(size, 0);
+  memory<int> sendOffsets(size+1);
+  memory<int> recvOffsets(size+1);
 
   //use the colMap of A to list the needed rows of B
   int r=0;
-  for (dlong n=A->NlocalCols;n<A->Ncols;n++) {
-    const hlong id = A->colMap[n];
-    while (id>=B->globalRowStarts[r+1]) r++; //assumes the halo is sorted
+  for (dlong n=A.NlocalCols;n<A.Ncols;n++) {
+    const hlong id = A.colMap[n];
+    while (id>=B.globalRowStarts[r+1]) r++; //assumes the halo is sorted
     recvCounts[r]++;
-    recvRows[n-A->NlocalCols] = id; //record the row to recv
+    recvRows[n-A.NlocalCols] = id; //record the row to recv
   }
 
   //share the counts
-  MPI_Alltoall(recvCounts, 1, MPI_INT,
-               sendCounts, 1, MPI_INT, A->comm);
+  A.comm.Alltoall(recvCounts, sendCounts);
 
+  sendOffsets[0] = 0;
+  recvOffsets[0] = 0;
   for (r=0;r<size;r++) {
     sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
     recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
   }
 
   int sendTotal = sendOffsets[size];
-  hlong *sendRows = (hlong *) calloc(sendTotal, sizeof(hlong));
+  memory<hlong> sendRows(sendTotal);
 
   //share the rowIds
-  MPI_Alltoallv(recvRows, recvCounts, recvOffsets, MPI_HLONG,
-                sendRows, sendCounts, sendOffsets, MPI_HLONG,
-                B->comm);
+  B.comm.Alltoallv(recvRows, recvCounts, recvOffsets,
+                   sendRows, sendCounts, sendOffsets);
 
   //we now have a list of rows to send, count the nnz to send
   dlong nnzTotal=0;
   for (r=0;r<size;r++) {
     sendCounts[r] =0; //reset
     for (int n=sendOffsets[r];n<sendOffsets[r+1];n++) {
-      dlong i = (dlong) (sendRows[n]-B->globalRowStarts[rank]); //local row id
-      sendCounts[r]+= B->diag.rowStarts[i+1]-B->diag.rowStarts[i]; //count entries in this row
-      sendCounts[r]+= B->offd.rowStarts[i+1]-B->offd.rowStarts[i]; //count entries in this row
+      dlong i = static_cast<dlong>(sendRows[n]-B.globalRowStarts[rank]); //local row id
+      sendCounts[r]+= B.diag.rowStarts[i+1]-B.diag.rowStarts[i]; //count entries in this row
+      sendCounts[r]+= B.offd.rowStarts[i+1]-B.offd.rowStarts[i]; //count entries in this row
     }
     nnzTotal += sendCounts[r]; //tally the total
   }
 
-  parCOO::nonZero_t *sendNonZeros = (parCOO::nonZero_t *) calloc(nnzTotal, sizeof(parCOO::nonZero_t));
+  memory<parCOO::nonZero_t> sendNonZeros(nnzTotal);
 
   nnzTotal=0; //reset
   for (r=0;r<size;r++) {
     for (int n=sendOffsets[r];n<sendOffsets[r+1];n++) {
-      dlong i = (dlong) (sendRows[n] - B->globalRowStarts[rank]); //local row id
-      for (dlong jj=B->diag.rowStarts[i]; jj<B->diag.rowStarts[i+1];jj++){
+      dlong i = static_cast<dlong>(sendRows[n] - B.globalRowStarts[rank]); //local row id
+      for (dlong jj=B.diag.rowStarts[i]; jj<B.diag.rowStarts[i+1];jj++){
         sendNonZeros[nnzTotal].row = sendRows[n];
-        sendNonZeros[nnzTotal].col = B->diag.cols[jj] + B->globalColStarts[rank];
-        sendNonZeros[nnzTotal].val = B->diag.vals[jj];
+        sendNonZeros[nnzTotal].col = B.diag.cols[jj] + B.globalColStarts[rank];
+        sendNonZeros[nnzTotal].val = B.diag.vals[jj];
         nnzTotal++;
       }
-      for (dlong jj=B->offd.rowStarts[i]; jj<B->offd.rowStarts[i+1];jj++){
+      for (dlong jj=B.offd.rowStarts[i]; jj<B.offd.rowStarts[i+1];jj++){
         sendNonZeros[nnzTotal].row = sendRows[n];
-        sendNonZeros[nnzTotal].col = B->colMap[B->offd.cols[jj]];
-        sendNonZeros[nnzTotal].val = B->offd.vals[jj];
+        sendNonZeros[nnzTotal].col = B.colMap[B.offd.cols[jj]];
+        sendNonZeros[nnzTotal].val = B.offd.vals[jj];
         nnzTotal++;
       }
     }
   }
 
-  MPI_Alltoall(sendCounts, 1, MPI_INT,
-               recvCounts, 1, MPI_INT, A->comm);
+  A.comm.Alltoall(sendCounts, recvCounts);
 
   for (r=0;r<size;r++) {
     sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
@@ -117,37 +117,27 @@ parCSR *SpMM(parCSR *A, parCSR *B){
 
 
   dlong Boffdnnz = recvOffsets[size]; //total nonzeros
-  parCOO::nonZero_t *BoffdRows = (parCOO::nonZero_t *)
-                                 calloc(Boffdnnz, sizeof(parCOO::nonZero_t));
-
-  MPI_Alltoallv(sendNonZeros, sendCounts, sendOffsets, MPI_NONZERO_T,
-                BoffdRows, recvCounts, recvOffsets, MPI_NONZERO_T,
-                B->comm);
+  memory<parCOO::nonZero_t> BoffdRows(Boffdnnz);
 
-  //clean up
-  MPI_Barrier(B->comm);
-  free(sendNonZeros);
-  free(sendCounts);
-  free(recvCounts);
-  free(sendOffsets);
-  free(recvOffsets);
+  B.comm.Alltoallv(sendNonZeros, sendCounts, sendOffsets,
+                   BoffdRows, recvCounts, recvOffsets);
 
   //we now have all the needed nonlocal rows (should also be sorted by row then col)
 
   //make an array of row offsets so we know how large each row is
-  dlong *BoffdRowOffsets = (dlong *) calloc(A->Ncols-A->NlocalCols+1, sizeof(dlong));
+  memory<dlong> BoffdRowOffsets(A.Ncols-A.NlocalCols+1, 0);
 
   dlong id=0;
   for (dlong n=0;n<Boffdnnz;n++) {
     hlong row = BoffdRows[n].row;
 
-    while(A->colMap[id+A->NlocalCols]!=row) id++;
+    while(A.colMap[id+A.NlocalCols]!=row) id++;
 
     BoffdRowOffsets[id+1]++; //count entry in row
   }
 
   //cumulative sum
-  for (dlong n=0;n<A->Ncols-A->NlocalCols;n++)
+  for (dlong n=0;n<A.Ncols-A.NlocalCols;n++)
     BoffdRowOffsets[n+1] += BoffdRowOffsets[n];
 
 
@@ -157,82 +147,78 @@ parCSR *SpMM(parCSR *A, parCSR *B){
 
   // Find how big the intermediate form is
   nnzTotal = 0;
-  for (dlong i=0;i<A->Nrows;i++) {
+  for (dlong i=0;i<A.Nrows;i++) {
     //local entries
-    dlong start = A->diag.rowStarts[i];
-    dlong end   = A->diag.rowStarts[i+1];
+    dlong start = A.diag.rowStarts[i];
+    dlong end   = A.diag.rowStarts[i+1];
     for (dlong j=start;j<end;j++) {
-      const dlong col = A->diag.cols[j];
-      const int nnzBj =  B->diag.rowStarts[col+1]-B->diag.rowStarts[col]
-                        +B->offd.rowStarts[col+1]-B->offd.rowStarts[col];
+      const dlong col = A.diag.cols[j];
+      const int nnzBj =  B.diag.rowStarts[col+1]-B.diag.rowStarts[col]
+                        +B.offd.rowStarts[col+1]-B.offd.rowStarts[col];
       nnzTotal += nnzBj;
     }
     //non-local entries
-    start = A->offd.rowStarts[i];
-    end   = A->offd.rowStarts[i+1];
+    start = A.offd.rowStarts[i];
+    end   = A.offd.rowStarts[i+1];
     for (dlong j=start;j<end;j++) {
-      const dlong col = A->offd.cols[j]-A->NlocalCols;
+      const dlong col = A.offd.cols[j]-A.NlocalCols;
       const int nnzBj = BoffdRowOffsets[col+1] - BoffdRowOffsets[col];
       nnzTotal += nnzBj;
     }
   }
 
-  parCOO::nonZero_t *Ctmp = (parCOO::nonZero_t *)
-                            calloc(nnzTotal, sizeof(parCOO::nonZero_t));
+  memory<parCOO::nonZero_t> Ctmp(nnzTotal);
 
   // Fill the intermediate form of C
   dlong cnt = 0;
-  for (dlong i=0;i<A->Nrows;i++) {
+  for (dlong i=0;i<A.Nrows;i++) {
     //local A entries
-    dlong start = A->diag.rowStarts[i];
-    dlong end   = A->diag.rowStarts[i+1];
+    dlong start = A.diag.rowStarts[i];
+    dlong end   = A.diag.rowStarts[i+1];
     for (dlong j=start;j<end;j++) {
-      const dlong col = A->diag.cols[j];
-      const dfloat Aval = A->diag.vals[j];
+      const dlong col = A.diag.cols[j];
+      const dfloat Aval = A.diag.vals[j];
 
       //local B entries
-      dlong Bstart = B->diag.rowStarts[col];
-      dlong Bend   = B->diag.rowStarts[col+1];
+      dlong Bstart = B.diag.rowStarts[col];
+      dlong Bend   = B.diag.rowStarts[col+1];
       for (dlong jj=Bstart;jj<Bend;jj++) {
-        Ctmp[cnt].row = i + A->globalRowStarts[rank];
-        Ctmp[cnt].col = B->diag.cols[jj]+B->globalColStarts[rank]; //global id
-        Ctmp[cnt].val = Aval*B->diag.vals[jj];
+        Ctmp[cnt].row = i + A.globalRowStarts[rank];
+        Ctmp[cnt].col = B.diag.cols[jj]+B.globalColStarts[rank]; //global id
+        Ctmp[cnt].val = Aval*B.diag.vals[jj];
         cnt++;
       }
       //non-local B entries
-      Bstart = B->offd.rowStarts[col];
-      Bend   = B->offd.rowStarts[col+1];
+      Bstart = B.offd.rowStarts[col];
+      Bend   = B.offd.rowStarts[col+1];
       for (dlong jj=Bstart;jj<Bend;jj++) {
-        Ctmp[cnt].row = i + A->globalRowStarts[rank];
-        Ctmp[cnt].col = B->colMap[B->offd.cols[jj]]; //global id
-        Ctmp[cnt].val = Aval*B->offd.vals[jj];
+        Ctmp[cnt].row = i + A.globalRowStarts[rank];
+        Ctmp[cnt].col = B.colMap[B.offd.cols[jj]]; //global id
+        Ctmp[cnt].val = Aval*B.offd.vals[jj];
         cnt++;
       }
     }
     //non-local A entries
-    start = A->offd.rowStarts[i];
-    end   = A->offd.rowStarts[i+1];
+    start = A.offd.rowStarts[i];
+    end   = A.offd.rowStarts[i+1];
     for (dlong j=start;j<end;j++) {
-      const dlong col = A->offd.cols[j]-A->NlocalCols;
-      const dfloat Aval = A->offd.vals[j];
+      const dlong col = A.offd.cols[j]-A.NlocalCols;
+      const dfloat Aval = A.offd.vals[j];
 
       // entries from recived rows of B
       dlong Bstart = BoffdRowOffsets[col];
       dlong Bend   = BoffdRowOffsets[col+1];
       for (dlong jj=Bstart;jj<Bend;jj++) {
-        Ctmp[cnt].row = i + A->globalRowStarts[rank];
+        Ctmp[cnt].row = i + A.globalRowStarts[rank];
         Ctmp[cnt].col = BoffdRows[jj].col; //global id
         Ctmp[cnt].val = Aval*BoffdRows[jj].val;
         cnt++;
       }
     }
   }
-  free(BoffdRowOffsets);
-  free(BoffdRows);
-
 
   //sort entries by the row and col
-  std::sort(Ctmp, Ctmp+nnzTotal,
+  std::sort(Ctmp.ptr(), Ctmp.ptr()+nnzTotal,
             [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& b) {
               if (a.row < b.row) return true;
               if (a.row > b.row) return false;
@@ -247,16 +233,14 @@ parCSR *SpMM(parCSR *A, parCSR *B){
     if ((Ctmp[i].row!=Ctmp[i-1].row)||
         (Ctmp[i].col!=Ctmp[i-1].col)) nnz++;
 
-  parCOO cooC(A->platform, A->comm);
+  parCOO cooC(A.platform, A.comm);
 
   //copy global partition
-  cooC.globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  cooC.globalColStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  memcpy(cooC.globalRowStarts, A->globalRowStarts, (size+1)*sizeof(hlong));
-  memcpy(cooC.globalColStarts, B->globalColStarts, (size+1)*sizeof(hlong));
+  cooC.globalRowStarts = A.globalRowStarts;
+  cooC.globalColStarts = B.globalColStarts;
 
   cooC.nnz = nnz;
-  cooC.entries = (parCOO::nonZero_t *) calloc(nnz,sizeof(parCOO::nonZero_t));
+  cooC.entries.malloc(nnz);
 
   //compress nonzeros
   nnz = 0;
@@ -269,11 +253,11 @@ parCSR *SpMM(parCSR *A, parCSR *B){
       cooC.entries[nnz-1].val += Ctmp[i].val;
     }
   }
-  //clean up
-  free(Ctmp);
 
   //build C from coo matrix
-  return new parCSR(cooC);
+  return parCSR(cooC);
 }
 
-} //namespace parAlmond
\ No newline at end of file
+} //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondStrongGraph.cpp b/libs/parAlmond/parAlmondStrongGraph.cpp
index 6453ad2c3..5fa6a6e1e 100644
--- a/libs/parAlmond/parAlmondStrongGraph.cpp
+++ b/libs/parAlmond/parAlmondStrongGraph.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,12 +27,14 @@ SOFTWARE.
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondAMGSetup.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-static strongGraph_t* RugeStubenStrength(parCSR *A, dfloat theta);
-static strongGraph_t* SymmetricStrength(parCSR *A, dfloat theta);
+static strongGraph_t RugeStubenStrength(parCSR& A, dfloat theta);
+static strongGraph_t SymmetricStrength(parCSR& A, dfloat theta);
 
-strongGraph_t* strongGraph(parCSR *A, StrengthType type, dfloat theta){
+strongGraph_t strongGraph(parCSR& A, StrengthType type, dfloat theta){
 
   if (type==RUGESTUBEN) {
     return RugeStubenStrength(A, theta);
@@ -42,19 +44,18 @@ strongGraph_t* strongGraph(parCSR *A, StrengthType type, dfloat theta){
 
 }
 
-static strongGraph_t* RugeStubenStrength(parCSR *A, dfloat theta) {
+static strongGraph_t RugeStubenStrength(parCSR& A, dfloat theta) {
 
-  const dlong N = A->Nrows;
-  const dlong M = A->Ncols;
+  const dlong N = A.Nrows;
+  const dlong M = A.Ncols;
 
-  strongGraph_t *C = new strongGraph_t(N, M, A->platform, A->comm);
+  strongGraph_t C(N, M, A.platform, A.comm);
 
-  C->rowStarts = (dlong *) calloc(N+1,sizeof(dlong));
+  C.rowStarts.malloc(N+1);
 
-  dfloat *maxOD = nullptr;
-  maxOD = (dfloat *) calloc(N,sizeof(dfloat));
+  memory<dfloat> maxOD(N,0.0);
 
-  dfloat *diagA = A->diagA;
+  memory<dfloat> diagA = A.diagA;
 
   //find maxOD
   // #pragma omp parallel for
@@ -62,49 +63,50 @@ static strongGraph_t* RugeStubenStrength(parCSR *A, dfloat theta) {
     const int sign = (diagA[i] >= 0) ? 1:-1;
 
     //local entries
-    dlong Jstart = A->diag.rowStarts[i];
-    dlong Jend   = A->diag.rowStarts[i+1];
+    dlong Jstart = A.diag.rowStarts[i];
+    dlong Jend   = A.diag.rowStarts[i+1];
     for(dlong jj= Jstart; jj<Jend; jj++){
-      const dlong col = A->diag.cols[jj];
+      const dlong col = A.diag.cols[jj];
       if (col==i) continue;
-      const dfloat OD = -sign*A->diag.vals[jj];
+      const dfloat OD = -sign*A.diag.vals[jj];
       if(OD > maxOD[i]) maxOD[i] = OD;
     }
     //non-local entries
-    Jstart = A->offd.rowStarts[i];
-    Jend   = A->offd.rowStarts[i+1];
+    Jstart = A.offd.rowStarts[i];
+    Jend   = A.offd.rowStarts[i+1];
     for(dlong jj= Jstart; jj<Jend; jj++){
-      dfloat OD = -sign*A->offd.vals[jj];
+      dfloat OD = -sign*A.offd.vals[jj];
       if(OD > maxOD[i]) maxOD[i] = OD;
     }
 
     int strong_per_row = 1; // diagonal entry
 
     //local entries
-    Jstart = A->diag.rowStarts[i];
-    Jend   = A->diag.rowStarts[i+1];
+    Jstart = A.diag.rowStarts[i];
+    Jend   = A.diag.rowStarts[i+1];
     for(dlong jj = Jstart; jj<Jend; jj++){
-      const dlong col = A->diag.cols[jj];
+      const dlong col = A.diag.cols[jj];
       if (col==i) continue;
-      const dfloat OD = -sign*A->diag.vals[jj];
+      const dfloat OD = -sign*A.diag.vals[jj];
       if(OD > theta*maxOD[i]) strong_per_row++;
     }
     //non-local entries
-    Jstart = A->offd.rowStarts[i];
-    Jend   = A->offd.rowStarts[i+1];
+    Jstart = A.offd.rowStarts[i];
+    Jend   = A.offd.rowStarts[i+1];
     for(dlong jj= Jstart; jj<Jend; jj++){
-      const dfloat OD = -sign*A->offd.vals[jj];
+      const dfloat OD = -sign*A.offd.vals[jj];
       if(OD > theta*maxOD[i]) strong_per_row++;
     }
-    C->rowStarts[i+1] = strong_per_row;
+    C.rowStarts[i+1] = strong_per_row;
   }
 
   // cumulative sum
+  C.rowStarts[0] = 0; // integer row offset: seed for the cumulative sum below
   for(dlong i=1; i<N+1 ; i++) {
-    C->rowStarts[i] += C->rowStarts[i-1];
+    C.rowStarts[i] += C.rowStarts[i-1];
   }
-  C->nnz = C->rowStarts[N];
-  C->cols = (dlong *) malloc(C->nnz*sizeof(dlong));
+  C.nnz = C.rowStarts[N];
+  C.cols.malloc(C.nnz);
 
 
   // fill in the columns for strong connections
@@ -112,124 +114,126 @@ static strongGraph_t* RugeStubenStrength(parCSR *A, dfloat theta) {
   for(dlong i=0; i<N; i++){
     const int sign = (diagA[i] >= 0) ? 1:-1;
 
-    dlong counter = C->rowStarts[i];
+    dlong counter = C.rowStarts[i];
 
     //local entries
-    dlong Jstart = A->diag.rowStarts[i];
-    dlong Jend   = A->diag.rowStarts[i+1];
+    dlong Jstart = A.diag.rowStarts[i];
+    dlong Jend   = A.diag.rowStarts[i+1];
     for(dlong jj = Jstart; jj<Jend; jj++){
-      const dlong col = A->diag.cols[jj];
+      const dlong col = A.diag.cols[jj];
       if (col==i) {
-        C->cols[counter++] = col;// diag entry
+        C.cols[counter++] = col;// diag entry
         continue;
       }
 
-      const dfloat OD = -sign*A->diag.vals[jj];
+      const dfloat OD = -sign*A.diag.vals[jj];
       if(OD > theta*maxOD[i])
-        C->cols[counter++] = col;
+        C.cols[counter++] = col;
     }
     //nonlocal entries
-    Jstart = A->offd.rowStarts[i];
-    Jend = A->offd.rowStarts[i+1];
+    Jstart = A.offd.rowStarts[i];
+    Jend = A.offd.rowStarts[i+1];
     for(dlong jj = Jstart; jj<Jend; jj++){
-      const dlong col = A->offd.cols[jj];
-      const dfloat OD = -sign*A->offd.vals[jj];
+      const dlong col = A.offd.cols[jj];
+      const dfloat OD = -sign*A.offd.vals[jj];
       if(OD > theta*maxOD[i])
-        C->cols[counter++] = col;
+        C.cols[counter++] = col;
     }
   }
-  free(maxOD);
 
   return C;
 }
 
-static strongGraph_t* SymmetricStrength(parCSR *A, dfloat theta) {
+static strongGraph_t SymmetricStrength(parCSR& A, dfloat theta) {
 
-  const dlong N = A->Nrows;
-  const dlong M = A->Ncols;
+  const dlong N = A.Nrows;
+  const dlong M = A.Ncols;
 
-  strongGraph_t *C = new strongGraph_t(N, M, A->platform, A->comm);
+  strongGraph_t C(N, M, A.platform, A.comm);
 
-  C->rowStarts = (dlong *) calloc(N+1,sizeof(dlong));
+  C.rowStarts.malloc(N+1);
 
-  dfloat *diagA = A->diagA;
+  memory<dfloat> diagA = A.diagA;
 
   // #pragma omp parallel for
   for(dlong i=0; i<N; i++){
     int strong_per_row = 1; // diagonal entry
 
-    const dfloat Aii = fabs(diagA[i]);
+    const dfloat Aii = std::abs(diagA[i]);
 
     //local entries
-    dlong Jstart = A->diag.rowStarts[i];
-    dlong Jend   = A->diag.rowStarts[i+1];
+    dlong Jstart = A.diag.rowStarts[i];
+    dlong Jend   = A.diag.rowStarts[i+1];
     for(dlong jj= Jstart; jj<Jend; jj++){
-      const dlong col = A->diag.cols[jj];
+      const dlong col = A.diag.cols[jj];
       if (col==i) continue;
 
-      const dfloat Ajj = fabs(diagA[col]);
+      const dfloat Ajj = std::abs(diagA[col]);
 
-      if(fabs(A->diag.vals[jj]) > theta*(sqrt(Aii*Ajj)))
+      if(std::abs(A.diag.vals[jj]) > theta*(std::sqrt(Aii*Ajj)))
         strong_per_row++;
     }
     //non-local entries
-    Jstart = A->offd.rowStarts[i];
-    Jend   = A->offd.rowStarts[i+1];
+    Jstart = A.offd.rowStarts[i];
+    Jend   = A.offd.rowStarts[i+1];
     for(dlong jj= Jstart; jj<Jend; jj++){
-      const dlong col = A->offd.cols[jj];
-      const dfloat Ajj = fabs(diagA[col]);
+      const dlong col = A.offd.cols[jj];
+      const dfloat Ajj = std::abs(diagA[col]);
 
-      if(fabs(A->offd.vals[jj]) > theta*(sqrt(Aii*Ajj)))
+      if(std::abs(A.offd.vals[jj]) > theta*(std::sqrt(Aii*Ajj)))
         strong_per_row++;
     }
 
-    C->rowStarts[i+1] = strong_per_row;
+    C.rowStarts[i+1] = strong_per_row;
   }
 
   // cumulative sum
+  C.rowStarts[0] = 0;
   for(dlong i=1; i<N+1 ; i++) {
-    C->rowStarts[i] += C->rowStarts[i-1];
+    C.rowStarts[i] += C.rowStarts[i-1];
   }
-  C->nnz = C->rowStarts[N];
-  C->cols = (dlong *) malloc(C->nnz*sizeof(dlong));
+  C.nnz = C.rowStarts[N];
+  C.cols.malloc(C.nnz);
 
 
   // fill in the columns for strong connections
   // #pragma omp parallel for
   for(dlong i=0; i<N; i++){
-    const dfloat Aii = fabs(diagA[i]);
+    const dfloat Aii = std::abs(diagA[i]);
 
-    dlong counter = C->rowStarts[i];
+    dlong counter = C.rowStarts[i];
 
     //local entries
-    dlong Jstart = A->diag.rowStarts[i];
-    dlong Jend   = A->diag.rowStarts[i+1];
+    dlong Jstart = A.diag.rowStarts[i];
+    dlong Jend   = A.diag.rowStarts[i+1];
     for(dlong jj= Jstart; jj<Jend; jj++){
-      const dlong col = A->diag.cols[jj];
+      const dlong col = A.diag.cols[jj];
       if (col==i) {
-        C->cols[counter++] = col;// diag entry
+        C.cols[counter++] = col;// diag entry
         continue;
       }
 
-      const dfloat Ajj = fabs(diagA[col]);
+      const dfloat Ajj = std::abs(diagA[col]);
 
-      if(fabs(A->diag.vals[jj]) > theta*(sqrt(Aii*Ajj)))
-        C->cols[counter++] = col;
+      if(std::abs(A.diag.vals[jj]) > theta*(std::sqrt(Aii*Ajj)))
+        C.cols[counter++] = col;
     }
     //non-local entries
-    Jstart = A->offd.rowStarts[i];
-    Jend   = A->offd.rowStarts[i+1];
+    Jstart = A.offd.rowStarts[i];
+    Jend   = A.offd.rowStarts[i+1];
     for(dlong jj= Jstart; jj<Jend; jj++){
-      const dlong col = A->offd.cols[jj];
+      const dlong col = A.offd.cols[jj];
 
-      const dfloat Ajj = fabs(diagA[col]);
+      const dfloat Ajj = std::abs(diagA[col]);
 
-      if(fabs(A->offd.vals[jj]) > theta*(sqrt(Aii*Ajj)))
-        C->cols[counter++] = col;
+      if(std::abs(A.offd.vals[jj]) > theta*(std::sqrt(Aii*Ajj)))
+        C.cols[counter++] = col;
     }
   }
 
   return C;
 }
 
-} //namespace parAlmond
\ No newline at end of file
+} //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondTentativeProlongator.cpp b/libs/parAlmond/parAlmondTentativeProlongator.cpp
index 8d93caba9..8632a3399 100644
--- a/libs/parAlmond/parAlmondTentativeProlongator.cpp
+++ b/libs/parAlmond/parAlmondTentativeProlongator.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,29 +27,28 @@ SOFTWARE.
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondAMGSetup.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-parCSR *tentativeProlongator(parCSR *A, hlong *FineToCoarse,
-                            hlong *globalAggStarts, dfloat *null){
+parCSR tentativeProlongator(parCSR& A, memory<hlong> FineToCoarse,
+                            memory<hlong> globalAggStarts, memory<dfloat> null){
 
-  int rank, size;
-  MPI_Comm_rank(A->comm, &rank);
-  MPI_Comm_size(A->comm, &size);
+  int rank = A.comm.rank();
+  // int size = A.comm.size();
 
-  const dlong NCoarse = (dlong) (globalAggStarts[rank+1]-globalAggStarts[rank]); //local num agg
+  const dlong NCoarse = static_cast<dlong>(globalAggStarts[rank+1]-globalAggStarts[rank]); //local num agg
 
-  parCOO cooP(A->platform, A->comm);
+  parCOO cooP(A.platform, A.comm);
 
   //copy global partition
-  cooP.globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  cooP.globalColStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  memcpy(cooP.globalRowStarts, A->globalRowStarts, (size+1)*sizeof(hlong));
-  memcpy(cooP.globalColStarts, globalAggStarts,   (size+1)*sizeof(hlong));
+  cooP.globalRowStarts = A.globalRowStarts;
+  cooP.globalColStarts = globalAggStarts;
 
-  const hlong globalRowOffset = A->globalRowStarts[rank];
+  const hlong globalRowOffset = A.globalRowStarts[rank];
 
-  cooP.nnz = A->Nrows;
-  cooP.entries = (parCOO::nonZero_t *) malloc(cooP.nnz*sizeof(parCOO::nonZero_t));
+  cooP.nnz = A.Nrows;
+  cooP.entries.malloc(cooP.nnz);
 
   for(dlong n=0; n<cooP.nnz; n++) {
     cooP.entries[n].row = n + globalRowOffset;
@@ -58,40 +57,42 @@ parCSR *tentativeProlongator(parCSR *A, hlong *FineToCoarse,
   }
 
   //build P from coo matrix
-  parCSR* P = new parCSR(cooP);
+  parCSR P(cooP);
 
   // normalize the columns of P and fill null with coarse null vector
 
   //check size. If this ever triggers, we'll have to implement a re-alloc of null
-  if (P->Ncols > A->Ncols)
-    LIBP_ABORT(string("Size of Coarse nullvector is too large, need to re-alloc"))
+  LIBP_ABORT("Size of Coarse nullvector is too large, need to re-alloc",
+             P.Ncols > A.Ncols);
 
   //set coarse null to 0
-  for(dlong i=0; i<P->Ncols; i++) null[i] = 0.0;
+  for(dlong i=0; i<P.Ncols; i++) null[i] = 0.0;
 
   //add local nonzeros
-  for(dlong i=0; i<P->diag.nnz; i++)
-    null[P->diag.cols[i]] += P->diag.vals[i] * P->diag.vals[i];
+  for(dlong i=0; i<P.diag.nnz; i++)
+    null[P.diag.cols[i]] += P.diag.vals[i] * P.diag.vals[i];
 
   //add nonlocal nonzeros
-  for(dlong i=0; i<P->offd.nnz; i++)
-    null[P->offd.cols[i]] += P->offd.vals[i] * P->offd.vals[i];
+  for(dlong i=0; i<P.offd.nnz; i++)
+    null[P.offd.cols[i]] += P.offd.vals[i] * P.offd.vals[i];
 
   //add the halo values to their origins
-  P->halo->Combine(null, 1, ogs_dfloat);
+  P.halo.Combine(null, 1);
 
   for(dlong i=0; i<NCoarse; i++)
     null[i] = sqrt(null[i]);
 
   //share the results
-  P->halo->Exchange(null, 1, ogs_dfloat);
+  P.halo.Exchange(null, 1);
 
-  for(dlong i=0; i<P->diag.nnz; i++)
-    P->diag.vals[i] /= null[P->diag.cols[i]];
-  for(dlong i=0; i<P->offd.nnz; i++)
-    P->offd.vals[i] /= null[P->offd.cols[i]];
+  for(dlong i=0; i<P.diag.nnz; i++)
+    P.diag.vals[i] /= null[P.diag.cols[i]];
+  for(dlong i=0; i<P.offd.nnz; i++)
+    P.offd.vals[i] /= null[P.offd.cols[i]];
 
   return P;
 }
 
-} //namespace parAlmond
\ No newline at end of file
+} //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondTranspose.cpp b/libs/parAlmond/parAlmondTranspose.cpp
index 47f016d56..3e93ff770 100644
--- a/libs/parAlmond/parAlmondTranspose.cpp
+++ b/libs/parAlmond/parAlmondTranspose.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,30 +27,30 @@ SOFTWARE.
 #include "parAlmond.hpp"
 #include "parAlmond/parAlmondAMGSetup.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
-parCSR *transpose(parCSR *A){
+parCSR transpose(parCSR& A){
 
   // MPI info
-  int rank, size;
-  MPI_Comm_rank(A->comm, &rank);
-  MPI_Comm_size(A->comm, &size);
+  int rank = A.comm.rank();
+  int size = A.comm.size();
 
   // copy data from nonlocal entries into send buffer
-  parCOO::nonZero_t *sendNonZeros = (parCOO::nonZero_t *)
-                                    calloc(A->offd.nnz, sizeof(parCOO::nonZero_t));
-  for(dlong i=0;i<A->offd.nzRows;++i){
-    const hlong row = A->offd.rows[i] + A->globalRowStarts[rank]; //global ids
-    for (dlong j=A->offd.mRowStarts[i];j<A->offd.mRowStarts[i+1];j++) {
-      const hlong col =  A->colMap[A->offd.cols[j]]; //global ids
+  memory<parCOO::nonZero_t> sendNonZeros(A.offd.nnz);
+  for(dlong i=0;i<A.offd.nzRows;++i){
+    const hlong row = A.offd.rows[i] + A.globalRowStarts[rank]; //global ids
+    for (dlong j=A.offd.mRowStarts[i];j<A.offd.mRowStarts[i+1];j++) {
+      const hlong col =  A.colMap[A.offd.cols[j]]; //global ids
       sendNonZeros[j].row = col;
       sendNonZeros[j].col = row;
-      sendNonZeros[j].val = A->offd.vals[j];
+      sendNonZeros[j].val = A.offd.vals[j];
     }
   }
 
   //sort by destination row
-  std::sort(sendNonZeros, sendNonZeros+A->offd.nnz,
+  std::sort(sendNonZeros.ptr(), sendNonZeros.ptr()+A.offd.nnz,
             [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& b) {
               if (a.row < b.row) return true;
               if (a.row > b.row) return false;
@@ -59,21 +59,22 @@ parCSR *transpose(parCSR *A){
             });
 
   //count number of non-zeros we're sending
-  int *sendCounts = (int*) calloc(size, sizeof(int));
-  int *recvCounts = (int*) calloc(size, sizeof(int));
-  int *sendOffsets = (int*) calloc(size+1, sizeof(int));
-  int *recvOffsets = (int*) calloc(size+1, sizeof(int));
+  memory<int> sendCounts(size, 0);
+  memory<int> recvCounts(size);
+  memory<int> sendOffsets(size+1);
+  memory<int> recvOffsets(size+1);
 
   int r=0;
-  for (dlong n=0;n<A->offd.nnz;n++) {
+  for (dlong n=0;n<A.offd.nnz;n++) {
     dlong row = sendNonZeros[n].row;
-    while(row>=A->globalColStarts[r+1]) r++;
+    while(row>=A.globalColStarts[r+1]) r++;
     sendCounts[r]++;
   }
 
-  MPI_Alltoall(sendCounts, 1, MPI_INT,
-               recvCounts, 1, MPI_INT, A->comm);
+  A.comm.Alltoall(sendCounts, recvCounts);
 
+  sendOffsets[0] = 0;
+  recvOffsets[0] = 0;
   for (r=0;r<size;r++) {
     sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
     recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
@@ -81,44 +82,33 @@ parCSR *transpose(parCSR *A){
   dlong offdnnz = recvOffsets[size]; //total offd nonzeros
 
 
-  parCOO cooAt(A->platform, A->comm);
+  parCOO cooAt(A.platform, A.comm);
 
   //copy global partition
-  cooAt.globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  cooAt.globalColStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  memcpy(cooAt.globalRowStarts, A->globalColStarts, (size+1)*sizeof(hlong));
-  memcpy(cooAt.globalColStarts, A->globalRowStarts, (size+1)*sizeof(hlong));
+  cooAt.globalRowStarts = A.globalColStarts;
+  cooAt.globalColStarts = A.globalRowStarts;
 
-  cooAt.nnz = A->diag.nnz+offdnnz;
-  cooAt.entries = (parCOO::nonZero_t *) calloc(cooAt.nnz, sizeof(parCOO::nonZero_t));
+  cooAt.nnz = A.diag.nnz+offdnnz;
+  cooAt.entries.malloc(cooAt.nnz);
 
   //fill local nonzeros
-  for(dlong i=0; i<A->Nrows; i++){
-    const dlong Jstart = A->diag.rowStarts[i];
-    const dlong Jend   = A->diag.rowStarts[i+1];
+  for(dlong i=0; i<A.Nrows; i++){
+    const dlong Jstart = A.diag.rowStarts[i];
+    const dlong Jend   = A.diag.rowStarts[i+1];
 
     for(dlong jj=Jstart; jj<Jend; jj++){
-      cooAt.entries[jj].row = A->diag.cols[jj] + A->globalColStarts[rank];
-      cooAt.entries[jj].col = i + A->globalRowStarts[rank];
-      cooAt.entries[jj].val = A->diag.vals[jj];
+      cooAt.entries[jj].row = A.diag.cols[jj] + A.globalColStarts[rank];
+      cooAt.entries[jj].col = i + A.globalRowStarts[rank];
+      cooAt.entries[jj].val = A.diag.vals[jj];
     }
   }
 
   // receive non-local nonzeros
-  MPI_Alltoallv(sendNonZeros,              sendCounts, sendOffsets, MPI_NONZERO_T,
-                cooAt.entries+A->diag.nnz, recvCounts, recvOffsets, MPI_NONZERO_T,
-                A->comm);
-
-  //clean up
-  MPI_Barrier(A->comm);
-  free(sendNonZeros);
-  free(sendCounts);
-  free(recvCounts);
-  free(sendOffsets);
-  free(recvOffsets);
+  A.comm.Alltoallv(sendNonZeros,             sendCounts, sendOffsets,
+                   cooAt.entries+A.diag.nnz, recvCounts, recvOffsets);
 
   //sort by row
-  std::sort(cooAt.entries, cooAt.entries+cooAt.nnz,
+  std::sort(cooAt.entries.ptr(), cooAt.entries.ptr()+cooAt.nnz,
             [](const parCOO::nonZero_t& a, const parCOO::nonZero_t& b) {
               if (a.row < b.row) return true;
               if (a.row > b.row) return false;
@@ -126,7 +116,9 @@ parCSR *transpose(parCSR *A){
               return a.col < b.col;
             });
 
-  return new parCSR(cooAt);
+  return parCSR(cooAt);
 }
 
-} //namespace parAlmond
\ No newline at end of file
+} //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondVcycle.cpp b/libs/parAlmond/parAlmondVcycle.cpp
index 8c232d945..d7227ec41 100644
--- a/libs/parAlmond/parAlmondVcycle.cpp
+++ b/libs/parAlmond/parAlmondVcycle.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,11 +25,13 @@ SOFTWARE.
 */
 
 #include "parAlmond.hpp"
-#include "parAlmond/parAlmondMultigrid.hpp"
+#include "parAlmond/parAlmondCoarseSolver.hpp"
+
+namespace libp {
 
 namespace parAlmond {
 
-void multigrid_t::vcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){
+void multigrid_t::vcycle(const int k, deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_X){
 
   //check for base level
   if(k==baseLevel) {
@@ -37,24 +39,26 @@ void multigrid_t::vcycle(const int k, occa::memory& o_RHS, occa::memory& o_X){
     return;
   }
 
-  multigridLevel *level  = levels[k];
-  occa::memory& o_RHSC = o_rhs[k+1];
-  occa::memory& o_XC   = o_x[k+1];
-  occa::memory& o_RES   = o_scratch;
+  multigridLevel& level = *levels[k];
+  deviceMemory<dfloat>& o_RHSC = o_rhs[k+1];
+  deviceMemory<dfloat>& o_XC   = o_x[k+1];
+  deviceMemory<dfloat>& o_RES  = o_scratch;
 
   //apply smoother to x and then compute res = rhs-Ax
-  level->smooth(o_RHS, o_X, true);
-  level->residual(o_RHS, o_X, o_RES);
+  level.smooth(o_RHS, o_X, true);
+  level.residual(o_RHS, o_X, o_RES);
 
   // rhsC = P^T res
-  level->coarsen(o_RES, o_RHSC);
+  level.coarsen(o_RES, o_RHSC);
 
   vcycle(k+1, o_RHSC, o_XC);
 
   // x = x + P xC
-  level->prolongate(o_XC, o_X);
+  level.prolongate(o_XC, o_X);
 
-  level->smooth(o_RHS, o_X, false);
+  level.smooth(o_RHS, o_X, false);
 }
 
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/parAlmond/parAlmondparCSR.cpp b/libs/parAlmond/parAlmondparCSR.cpp
index 9339ba003..e579fbbf1 100644
--- a/libs/parAlmond/parAlmondparCSR.cpp
+++ b/libs/parAlmond/parAlmondparCSR.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,6 +28,8 @@ SOFTWARE.
 #include "parAlmond/parAlmondparCSR.hpp"
 #include "parAlmond/parAlmondKernels.hpp"
 
+namespace libp {
+
 namespace parAlmond {
 
 //------------------------------------------------------------------------
@@ -36,8 +38,10 @@ namespace parAlmond {
 //
 //------------------------------------------------------------------------
 
-void parCSR::SpMV(const dfloat alpha, dfloat *x,
-                  const dfloat beta, dfloat *y) {
+void parCSR::SpMV(const dfloat alpha, memory<dfloat>& x,
+                  const dfloat beta, memory<dfloat>& y) {
+
+  halo.ExchangeStart(x, 1);
 
   // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j])
   // #pragma omp parallel for
@@ -52,7 +56,7 @@ void parCSR::SpMV(const dfloat alpha, dfloat *x,
       y[i] = alpha*result;
   }
 
-  halo->Exchange(x, 1, ogs_dfloat);
+  halo.ExchangeFinish(x, 1);
 
   // #pragma omp parallel for
   for(dlong i=0; i<offd.nzRows; i++){ //local
@@ -65,8 +69,10 @@ void parCSR::SpMV(const dfloat alpha, dfloat *x,
   }
 }
 
-void parCSR::SpMV(const dfloat alpha, dfloat *x,
-                  const dfloat beta, const dfloat *y, dfloat *z) {
+void parCSR::SpMV(const dfloat alpha, memory<dfloat>& x,
+                  const dfloat beta, const memory<dfloat>& y, memory<dfloat>& z) {
+
+  halo.ExchangeStart(x, 1);
 
   // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j])
   // #pragma omp parallel for
@@ -78,7 +84,7 @@ void parCSR::SpMV(const dfloat alpha, dfloat *x,
     z[i] = alpha*result + beta*y[i];
   }
 
-  halo->Exchange(x, 1, ogs_dfloat);
+  halo.ExchangeFinish(x, 1);
 
   for(dlong i=0; i<offd.nzRows; i++){ //local
     const dlong row = offd.rows[i];
@@ -90,10 +96,10 @@ void parCSR::SpMV(const dfloat alpha, dfloat *x,
   }
 }
 
-void parCSR::SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta,
-                  occa::memory& o_y) {
+void parCSR::SpMV(const dfloat alpha, deviceMemory<dfloat>& o_x, const dfloat beta,
+                  deviceMemory<dfloat>& o_y) {
 
-  halo->ExchangeStart(o_x, 1, ogs_dfloat);
+  halo.ExchangeStart(o_x, 1);
 
   // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j])
   if (diag.NrowBlocks)
@@ -102,7 +108,7 @@ void parCSR::SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta,
                    diag.o_cols, diag.o_vals,
                    o_x, o_y);
 
-  halo->ExchangeFinish(o_x, 1, ogs_dfloat);
+  halo.ExchangeFinish(o_x, 1);
 
   const dfloat one = 1.0;
   if (offd.NrowBlocks)
@@ -112,10 +118,10 @@ void parCSR::SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta,
                    o_x, o_y);
 }
 
-void parCSR::SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta,
-                  occa::memory& o_y, occa::memory& o_z) {
+void parCSR::SpMV(const dfloat alpha, deviceMemory<dfloat>& o_x, const dfloat beta,
+                  deviceMemory<dfloat>& o_y, deviceMemory<dfloat>& o_z) {
 
-  halo->ExchangeStart(o_x, 1, ogs_dfloat);
+  halo.ExchangeStart(o_x, 1);
 
   // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j])
   if (diag.NrowBlocks)
@@ -124,7 +130,7 @@ void parCSR::SpMV(const dfloat alpha, occa::memory& o_x, const dfloat beta,
                    diag.o_cols, diag.o_vals,
                    o_x, o_y, o_z);
 
-  halo->ExchangeFinish(o_x, 1, ogs_dfloat);
+  halo.ExchangeFinish(o_x, 1);
 
   const dfloat one = 1.0;
   if (offd.NrowBlocks)
@@ -146,25 +152,21 @@ parCSR::parCSR(parCOO& A):       // number of nonzeros on this rank
   platform(A.platform),
   comm(A.comm) {
 
-  int rank;
-  int size;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &size);
+  int rank = comm.rank();
+  // int size = comm.size();
 
   //copy global partition
-  globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  globalColStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  memcpy(globalRowStarts, A.globalRowStarts, (size+1)*sizeof(hlong));
-  memcpy(globalColStarts, A.globalColStarts, (size+1)*sizeof(hlong));
+  globalRowStarts = A.globalRowStarts;
+  globalColStarts = A.globalColStarts;
 
   const hlong globalRowOffset = globalRowStarts[rank];
   const hlong globalColOffset = globalColStarts[rank];
 
-  Nrows = (dlong)(globalRowStarts[rank+1]-globalRowStarts[rank]);
-  Ncols = (dlong)(globalColStarts[rank+1]-globalColStarts[rank]);
+  Nrows = static_cast<dlong>(globalRowStarts[rank+1]-globalRowStarts[rank]);
+  Ncols = static_cast<dlong>(globalColStarts[rank+1]-globalColStarts[rank]);
 
-  diag.rowStarts = (dlong *) calloc(Nrows+1, sizeof(dlong));
-  offd.rowStarts = (dlong *) calloc(Nrows+1, sizeof(dlong));
+  diag.rowStarts.malloc(Nrows+1, 0);
+  offd.rowStarts.malloc(Nrows+1, 0);
 
   //count the entries in each row
   for (dlong n=0;n<A.nnz;n++) {
@@ -182,11 +184,12 @@ parCSR::parCSR(parCOO& A):       // number of nonzeros on this rank
   for(dlong i=0; i<Nrows; i++)
     if (offd.rowStarts[i+1]>0) offd.nzRows++;
 
-  offd.rows       = (dlong *) calloc(offd.nzRows, sizeof(dlong));
-  offd.mRowStarts = (dlong *) calloc(offd.nzRows+1, sizeof(dlong));
+  offd.rows.malloc(offd.nzRows);
+  offd.mRowStarts.malloc(offd.nzRows+1);
 
   // cumulative sum
   dlong cnt=0;
+  offd.mRowStarts[0] = 0;
   for(dlong i=0; i<Nrows; i++) {
     if (offd.rowStarts[i+1]>0) {
       offd.rows[cnt] = i; //record row id
@@ -201,7 +204,7 @@ parCSR::parCSR(parCOO& A):       // number of nonzeros on this rank
 
   // Halo setup
   cnt=0;
-  hlong *colIds = (hlong *) malloc(offd.nnz*sizeof(hlong));
+  memory<hlong> colIds(offd.nnz);
   for (dlong n=0;n<A.nnz;n++) {
     if ( (A.entries[n].col < globalColOffset)
       || (A.entries[n].col > globalColOffset+Ncols-1))
@@ -210,10 +213,10 @@ parCSR::parCSR(parCOO& A):       // number of nonzeros on this rank
   haloSetup(colIds); //setup halo, and transform colIds to a local indexing
 
   //fill the CSR matrices
-  diag.cols = (dlong *)  calloc(diag.nnz, sizeof(dlong));
-  offd.cols = (dlong *)  calloc(offd.nnz, sizeof(dlong));
-  diag.vals = (pfloat *) calloc(diag.nnz, sizeof(pfloat));
-  offd.vals = (pfloat *) calloc(offd.nnz, sizeof(pfloat));
+  diag.cols.malloc(diag.nnz);
+  offd.cols.malloc(offd.nnz);
+  diag.vals.malloc(diag.nnz);
+  offd.vals.malloc(offd.nnz);
   dlong diagCnt = 0;
   dlong offdCnt = 0;
   for (dlong n=0;n<A.nnz;n++) {
@@ -223,12 +226,11 @@ parCSR::parCSR(parCOO& A):       // number of nonzeros on this rank
       offd.vals[offdCnt] = A.entries[n].val;
       offdCnt++;
     } else {
-      diag.cols[diagCnt] = (dlong) (A.entries[n].col - globalColOffset);
+      diag.cols[diagCnt] = static_cast<dlong>(A.entries[n].col - globalColOffset);
       diag.vals[diagCnt] = A.entries[n].val;
       diagCnt++;
     }
   }
-  free(colIds);
 }
 
 //------------------------------------------------------------------------
@@ -247,15 +249,14 @@ typedef struct {
 } parallelId_t;
 
 
-void parCSR::haloSetup(hlong *colIds) {
+void parCSR::haloSetup(memory<hlong> colIds) {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
+  int rank = comm.rank();
 
   const hlong globalOffset = globalColStarts[rank];
 
   //collect the unique nonlocal column ids
-  parallelId_t*  parIds = (parallelId_t*) malloc(offd.nnz*sizeof(parallelId_t));
+  memory<parallelId_t> parIds(offd.nnz);
 
   for (dlong n=0;n<offd.nnz;n++) {
     parIds[n].localId  = n;
@@ -263,7 +264,7 @@ void parCSR::haloSetup(hlong *colIds) {
   }
 
   //sort by global index
-  std::sort(parIds, parIds+offd.nnz,
+  std::sort(parIds.ptr(), parIds.ptr()+offd.nnz,
             [](const parallelId_t& a, const parallelId_t& b) {
               if(a.globalId < b.globalId) return true;
               if(a.globalId > b.globalId) return false;
@@ -283,7 +284,7 @@ void parCSR::haloSetup(hlong *colIds) {
   if(offd.nnz) Noffdcols++;
 
   //record the global ids of the unique columns
-  hlong *offdcols = (hlong *) malloc(Noffdcols*sizeof(hlong));
+  memory<hlong> offdcols(Noffdcols);
   Noffdcols = 0;
   if(offd.nnz) offdcols[Noffdcols++] = parIds[0].globalId;
   for (dlong n=1;n<offd.nnz;n++)
@@ -291,7 +292,7 @@ void parCSR::haloSetup(hlong *colIds) {
       offdcols[Noffdcols++] = parIds[n].globalId;
 
   //sort back to local order
-  std::sort(parIds, parIds+offd.nnz,
+  std::sort(parIds.ptr(), parIds.ptr()+offd.nnz,
             [](const parallelId_t& a, const parallelId_t& b) {
               if(a.localId < b.localId) return true;
               if(a.localId > b.localId) return false;
@@ -304,22 +305,20 @@ void parCSR::haloSetup(hlong *colIds) {
   Ncols += Noffdcols;
 
   //make an array of all the column ids required on this rank (local first)
-  colMap = (hlong*) malloc(Ncols*sizeof(hlong));
+  colMap.malloc(Ncols);
   for (dlong n=0; n<NlocalCols; n++)      colMap[n] = n+globalOffset+1; //local rows
   for (dlong n=NlocalCols; n<Ncols; n++)  colMap[n] = -(offdcols[n-NlocalCols]+1);    //nonlocal rows
 
   //make a halo exchange to share column entries and an ogs for gsops accross columns
   int verbose = 0;
-  halo = halo_t::Setup(Ncols, colMap, comm, verbose, platform);
+  halo.Setup(Ncols, colMap, comm, ogs::Auto, verbose, platform);
 
   //shift back to 0-indexed
-  for (dlong n=0; n<Ncols; n++) colMap[n]=abs(colMap[n])-1;
+  for (dlong n=0; n<Ncols; n++) colMap[n]=std::abs(colMap[n])-1;
 
   //update column numbering
   for (dlong n=0;n<offd.nnz;n++)
     colIds[n] = NlocalCols + parIds[n].newId;
-
-  free(parIds);
 }
 
 //------------------------------------------------------------------------
@@ -330,8 +329,8 @@ void parCSR::haloSetup(hlong *colIds) {
 
 void parCSR::diagSetup() {
   //fill the CSR matrices
-  diagA   = (dfloat *) calloc(Ncols, sizeof(dfloat));
-  diagInv = (dfloat *) calloc(Ncols, sizeof(dfloat));
+  diagA.malloc(Ncols);
+  diagInv.malloc(Ncols);
 
   for (dlong i=0;i<Nrows;i++) {
     const dlong start = diag.rowStarts[i];
@@ -345,7 +344,7 @@ void parCSR::diagSetup() {
   }
 
   //fill the halo region
-  halo->Exchange(diagA, 1, ogs_dfloat);
+  halo.Exchange(diagA, 1);
 
   //compute the inverse diagonal
   for (dlong n=0;n<Nrows;n++)
@@ -355,31 +354,6 @@ void parCSR::diagSetup() {
   rho = rhoDinvA();
 }
 
-parCSR::~parCSR() {
-  if (diag.blockRowStarts) free(diag.blockRowStarts);
-  if (diag.rowStarts) free(diag.rowStarts);
-  if (diag.cols) free(diag.cols);
-  if (diag.vals) free(diag.vals);
-
-  if (offd.blockRowStarts) free(offd.blockRowStarts);
-  if (offd.rowStarts) free(offd.rowStarts);
-  if (offd.mRowStarts) free(offd.mRowStarts);
-  if (offd.rows) free(offd.rows);
-  if (offd.cols) free(offd.cols);
-  if (offd.vals) free(offd.vals);
-
-  if (diagA) free(diagA);
-  if (diagInv) free(diagInv);
-
-  if (o_diagA.size()) o_diagA.free();
-  if (o_diagInv.size()) o_diagInv.free();
-
-  if (globalRowStarts) free(globalRowStarts);
-  if (globalColStarts) free(globalColStarts);
-  if (colMap) free(colMap);
-
-  if (halo)   halo->Free();
-}
 
 //------------------------------------------------------------------------
 //
@@ -389,44 +363,44 @@ parCSR::~parCSR() {
 
 dfloat parCSR::rhoDinvA(){
 
-  int size;
-  MPI_Comm_size(comm, &size);
+  int size = comm.size();
 
   int k = 10;
 
   hlong Ntotal = globalRowStarts[size];
-  if(k > Ntotal) k = (int) Ntotal;
+  if(k > Ntotal) k = static_cast<int>(Ntotal);
 
   // do an arnoldi
 
   // allocate memory for Hessenberg matrix
-  double *H = (double *) calloc(k*k,sizeof(double));
+  memory<double> H(k*k, 0.0);
 
   // allocate memory for basis
-  dfloat **V = (dfloat **) calloc(k+1, sizeof(dfloat *));
-  dfloat *Vx = (dfloat *) calloc(Ncols, sizeof(dfloat));
+  memory<memory<dfloat>> V(k+1);
+  memory<dfloat> Vx(Ncols);
 
-  for(int i=0; i<=k; i++)
-    V[i] = (dfloat *) calloc(Nrows, sizeof(dfloat));
+  for(int i=0; i<=k; i++) {
+    V[i].malloc(Nrows);
+  }
 
   // generate a random vector for initial basis vector
   for(dlong n=0; n<Nrows; n++) Vx[n] = (dfloat) drand48();
 
   // dfloat norm_vo = vectorNorm(Nrows,Vx, comm);
-  dfloat norm_vo=0.0, gnorm_vo=0.0;
+  dfloat norm_vo=0.0;
   for(dlong n=0; n<Nrows; n++) norm_vo += Vx[n]*Vx[n];
-  MPI_Allreduce(&norm_vo, &gnorm_vo, 1, MPI_DFLOAT, MPI_SUM, comm);
-  norm_vo = sqrt(gnorm_vo);
+  comm.Allreduce(norm_vo);
+  norm_vo = sqrt(norm_vo);
 
   // vectorScale(Nrows, 1.0/norm_vo, Vx);
   for(dlong n=0; n<Nrows; n++) Vx[n] *= (1.0/norm_vo);
 
   //V[0] = Vx
-  memcpy(V[0], Vx, Nrows*sizeof(dfloat));
+  V[0].copyFrom(Vx, Nrows);
 
   for(int j=0; j<k; j++){
     //Vx = V[j]
-    memcpy(Vx, V[j], Nrows*sizeof(dfloat));
+    Vx.copyFrom(V[j], Nrows);
 
     // v[j+1] = invD*(A*v[j])
     SpMV(1.0, Vx, 0., V[j+1]);
@@ -437,36 +411,36 @@ dfloat parCSR::rhoDinvA(){
     for(int i=0; i<=j; i++){
       // H(i,j) = v[i]'*A*v[j]
       // dfloat hij = vectorInnerProd(Nrows, V[i], V[j+1],comm);
-      dfloat local_hij=0.0, hij=0.0;
-      for(dlong n=0; n<Nrows; n++) local_hij += V[i][n]*V[j+1][n];
-      MPI_Allreduce(&local_hij, &hij, 1, MPI_DFLOAT, MPI_SUM, comm);
+      dfloat hij=0.0;
+      for(dlong n=0; n<Nrows; n++) hij += V[i][n]*V[j+1][n];
+      comm.Allreduce(hij);
 
       // v[j+1] = v[j+1] - hij*v[i]
       // vectorAdd(Nrows,-hij, V[i], 1.0, V[j+1]);
       for(dlong n=0; n<Nrows; n++) V[j+1][n] += -hij*V[i][n];
 
-      H[i + j*k] = (double) hij;
+      H[i + j*k] = static_cast<double>(hij);
     }
 
     if(j+1 < k){
 
       // dfloat norm_vj = vectorNorm(Nrows,V[j+1],comm);
-      dfloat norm_vj=0.0, gnorm_vj=0.0;
+      dfloat norm_vj=0.0;
       for(dlong n=0; n<Nrows; n++) norm_vj += V[j+1][n]*V[j+1][n];
-      MPI_Allreduce(&norm_vj, &gnorm_vj, 1, MPI_DFLOAT, MPI_SUM, comm);
-      norm_vj = sqrt(gnorm_vj);
+      comm.Allreduce(norm_vj);
+      norm_vj = sqrt(norm_vj);
 
-      H[j+1+ j*k] = (double) norm_vj;
+      H[j+1+ j*k] = static_cast<double>(norm_vj);
 
       // vectorScale(Nrows, 1./H[j+1 + j*k], V[j+1]);
       for(dlong n=0; n<Nrows; n++) V[j+1][n] *= (1./H[j+1 + j*k]);
     }
   }
 
-  double *WR = (double *) calloc(k,sizeof(double));
-  double *WI = (double *) calloc(k,sizeof(double));
+  memory<double> WR(k);
+  memory<double> WI(k);
 
-  matrixEigenValues(k, H, WR, WI);
+  linAlg_t::matrixEigenValues(k, H, WR, WI);
 
   double RHO = 0.;
 
@@ -478,15 +452,6 @@ dfloat parCSR::rhoDinvA(){
     }
   }
 
-  free(H);
-  free(WR);
-  free(WI);
-
-  // free memory
-  for(int i=0; i<=k; i++) free(V[i]);
-  free(Vx);
-  free(V);
-
   // printf("weight = %g \n", RHO);
 
   return RHO;
@@ -496,7 +461,7 @@ void parCSR::syncToDevice() {
 
   if (Nrows) {
     //transfer matrix data
-    diag.o_rowStarts = platform.malloc((Nrows+1)*sizeof(dlong), diag.rowStarts);
+    diag.o_rowStarts = platform.malloc<dlong>(diag.rowStarts);
 
     diag.NrowBlocks=0;
     if (diag.nnz) {
@@ -506,12 +471,9 @@ void parCSR::syncToDevice() {
       for (dlong i=0;i<Nrows;i++) {
         dlong rowSize = diag.rowStarts[i+1]-diag.rowStarts[i];
 
-        if (rowSize > parAlmond::NonzerosPerBlock) {
-          //this row is pathalogically big. We can't currently run this
-          stringstream ss;
-          ss << "Multiplicity of row: " << i << " is " << rowSize << " in parAlmond::parCSR setup and is too large.";
-          LIBP_ABORT(ss.str())
-        }
+        //this row may be pathologically big. We can't currently run this
+        LIBP_ABORT("Multiplicity of row: " << i << " is " << rowSize << " in parAlmond::parCSR setup and is too large.",
+                   rowSize > parAlmond::NonzerosPerBlock);
 
         if (blockSum+rowSize > parAlmond::NonzerosPerBlock) { //adding this row will exceed the nnz per block
           diag.NrowBlocks++; //count the previous block
@@ -521,7 +483,7 @@ void parCSR::syncToDevice() {
         }
       }
 
-      diag.blockRowStarts  = (dlong*) calloc(diag.NrowBlocks+1,sizeof(dlong));
+      diag.blockRowStarts.malloc(diag.NrowBlocks+1, 0);
 
       blockSum=0;
       diag.NrowBlocks=1;
@@ -536,11 +498,11 @@ void parCSR::syncToDevice() {
         }
       }
       diag.blockRowStarts[diag.NrowBlocks] = Nrows;
-      diag.o_blockRowStarts = platform.malloc((diag.NrowBlocks+1)*sizeof(dlong), diag.blockRowStarts);
+      diag.o_blockRowStarts = platform.malloc<dlong>(diag.blockRowStarts);
 
       //transfer matrix data
-      diag.o_cols = platform.malloc(diag.nnz*sizeof(dlong),   diag.cols);
-      diag.o_vals = platform.malloc(diag.nnz*sizeof(pfloat),  diag.vals);
+      diag.o_cols = platform.malloc<dlong>(diag.cols);
+      diag.o_vals = platform.malloc<pfloat>(diag.vals);
     }
 
     if (offd.nnz) {
@@ -550,12 +512,9 @@ void parCSR::syncToDevice() {
       for (dlong i=0;i<offd.nzRows;i++) {
         dlong rowSize = offd.mRowStarts[i+1]-offd.mRowStarts[i];
 
-        if (rowSize > parAlmond::NonzerosPerBlock) {
-          //this row is pathalogically big. We can't currently run this
-          stringstream ss;
-          ss << "Multiplicity of row: " << i << " is " << rowSize << " in parAlmond::parCSR setup and is too large.";
-          LIBP_ABORT(ss.str())
-        }
+        //this row may be pathologically big. We can't currently run this
+        LIBP_ABORT("Multiplicity of row: " << i << " is " << rowSize << " in parAlmond::parCSR setup and is too large.",
+                   rowSize > parAlmond::NonzerosPerBlock);
 
         if (blockSum+rowSize > parAlmond::NonzerosPerBlock) { //adding this row will exceed the nnz per block
           offd.NrowBlocks++; //count the previous block
@@ -565,7 +524,7 @@ void parCSR::syncToDevice() {
         }
       }
 
-      offd.blockRowStarts  = (dlong*) calloc(offd.NrowBlocks+1,sizeof(dlong));
+      offd.blockRowStarts.malloc(offd.NrowBlocks+1, 0);
 
       blockSum=0;
       offd.NrowBlocks=1;
@@ -580,21 +539,23 @@ void parCSR::syncToDevice() {
         }
       }
       offd.blockRowStarts[offd.NrowBlocks] = offd.nzRows;
-      offd.o_blockRowStarts = platform.malloc((offd.NrowBlocks+1)*sizeof(dlong), offd.blockRowStarts);
+      offd.o_blockRowStarts = platform.malloc<dlong>(offd.blockRowStarts);
 
       //transfer matrix data
-      offd.o_rows       = platform.malloc(offd.nzRows*sizeof(dlong), offd.rows);
-      offd.o_mRowStarts = platform.malloc((offd.nzRows+1)*sizeof(dlong), offd.mRowStarts);
+      offd.o_rows       = platform.malloc<dlong>(offd.rows);
+      offd.o_mRowStarts = platform.malloc<dlong>(offd.mRowStarts);
 
-      offd.o_cols = platform.malloc(offd.nnz*sizeof(dlong),   offd.cols);
-      offd.o_vals = platform.malloc(offd.nnz*sizeof(pfloat),  offd.vals);
+      offd.o_cols = platform.malloc<dlong>(offd.cols);
+      offd.o_vals = platform.malloc<pfloat>(offd.vals);
     }
 
-    if (diagA) {
-      o_diagA = platform.malloc(Nrows*sizeof(dfloat), diagA);
-      o_diagInv = platform.malloc(Nrows*sizeof(dfloat), diagInv);
+    if (diagA.size()) {
+      o_diagA = platform.malloc<dfloat>(diagA);
+      o_diagInv = platform.malloc<dfloat>(diagInv);
     }
   }
 }
 
 } //namespace parAlmond
+
+} //namespace libp
diff --git a/libs/timeStepper/okl/timeStepperAB.okl b/libs/timeStepper/okl/timeStepperAB.okl
index 301ad129a..62595718c 100644
--- a/libs/timeStepper/okl/timeStepperAB.okl
+++ b/libs/timeStepper/okl/timeStepperAB.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/timeStepper/okl/timeStepperDOPRI5.okl b/libs/timeStepper/okl/timeStepperDOPRI5.okl
index 0965545f4..f4916e8fa 100644
--- a/libs/timeStepper/okl/timeStepperDOPRI5.okl
+++ b/libs/timeStepper/okl/timeStepperDOPRI5.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -132,28 +132,19 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_err[t] += s_err[t+512];
-    @barrier("local");
 #endif
 #if p_blockSize>256
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_err[t] += s_err[t+256];
-    @barrier("local");
 #endif
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_err[t] += s_err[t+128];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_err[t] += s_err[t+64];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_err[t] += s_err[t+32];
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_err[t] += s_err[t+16];
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_err[t] += s_err[t+8];
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_err[t] += s_err[t+4];
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_err[t] += s_err[t+2];
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) errtmp[b] = s_err[0] + s_err[1];
   }
 }
diff --git a/libs/timeStepper/okl/timeStepperEXTBDF.okl b/libs/timeStepper/okl/timeStepperEXTBDF.okl
index ca70375bc..f29410a79 100644
--- a/libs/timeStepper/okl/timeStepperEXTBDF.okl
+++ b/libs/timeStepper/okl/timeStepperEXTBDF.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/timeStepper/okl/timeStepperLSERK4.okl b/libs/timeStepper/okl/timeStepperLSERK4.okl
index 33770a3b8..13b77ae2e 100644
--- a/libs/timeStepper/okl/timeStepperLSERK4.okl
+++ b/libs/timeStepper/okl/timeStepperLSERK4.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/timeStepper/okl/timeStepperMRAB.okl b/libs/timeStepper/okl/timeStepperMRAB.okl
index 609fd7956..e28d43c89 100644
--- a/libs/timeStepper/okl/timeStepperMRAB.okl
+++ b/libs/timeStepper/okl/timeStepperMRAB.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -72,9 +72,6 @@ SOFTWARE.
       }
     }
 
-    // make sure all node data is loaded into @shared
-    @barrier("local");
-
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
 
       // Update q
@@ -149,9 +146,6 @@ SOFTWARE.
       }
     }
 
-    // make sure all node data is loaded into @shared
-    @barrier("local");
-
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
 
       if(n<p_Nfaces*p_Nfp){
@@ -213,4 +207,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/libs/timeStepper/okl/timeStepperMRSAAB.okl b/libs/timeStepper/okl/timeStepperMRSAAB.okl
index 023bd6451..c2dea33b6 100644
--- a/libs/timeStepper/okl/timeStepperMRSAAB.okl
+++ b/libs/timeStepper/okl/timeStepperMRSAAB.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -74,9 +74,6 @@ SOFTWARE.
       }
     }
 
-    // make sure all node data is loaded into @shared
-    @barrier("local");
-
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
 
       // Update q
@@ -153,9 +150,6 @@ SOFTWARE.
       }
     }
 
-    // make sure all node data is loaded into @shared
-    @barrier("local");
-
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
 
       if(n<p_Nfaces*p_Nfp){
@@ -217,4 +211,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/libs/timeStepper/okl/timeStepperSAAB.okl b/libs/timeStepper/okl/timeStepperSAAB.okl
index 0106054d6..559ca0660 100644
--- a/libs/timeStepper/okl/timeStepperSAAB.okl
+++ b/libs/timeStepper/okl/timeStepperSAAB.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -78,4 +78,4 @@ SOFTWARE.
 
     q[n] += dt*dq;
   }
-}
\ No newline at end of file
+}
diff --git a/libs/timeStepper/okl/timeStepperSARK.okl b/libs/timeStepper/okl/timeStepperSARK.okl
index f3bb702e5..7543deb70 100644
--- a/libs/timeStepper/okl/timeStepperSARK.okl
+++ b/libs/timeStepper/okl/timeStepperSARK.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -120,28 +120,20 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 #if p_blockSize>512
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<512) s_err[t] += s_err[t+512];
-    @barrier("local");
 #endif
 #if p_blockSize>256
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<256) s_err[t] += s_err[t+256];
-    @barrier("local");
 #endif
 
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<128) s_err[t] += s_err[t+128];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 64) s_err[t] += s_err[t+64];
-    @barrier("local");
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 32) s_err[t] += s_err[t+32];
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t< 16) s_err[t] += s_err[t+16];
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  8) s_err[t] += s_err[t+8];
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  4) s_err[t] += s_err[t+4];
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  2) s_err[t] += s_err[t+2];
-
     for(int t=0;t<p_blockSize;++t;@inner(0)) if(t<  1) errtmp[b] = s_err[0] + s_err[1];
   }
 }
diff --git a/libs/timeStepper/okl/timeStepperSSBDF.okl b/libs/timeStepper/okl/timeStepperSSBDF.okl
index 95562fcbe..e43689522 100644
--- a/libs/timeStepper/okl/timeStepperSSBDF.okl
+++ b/libs/timeStepper/okl/timeStepperSSBDF.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/libs/timeStepper/timeStepper.cpp b/libs/timeStepper/timeStepper.cpp
new file mode 100644
index 000000000..03d1d0669
--- /dev/null
+++ b/libs/timeStepper/timeStepper.cpp
@@ -0,0 +1,59 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "core.hpp"
+#include "timeStepper.hpp"
+
+namespace libp {
+
+void timeStepper_t::Run(solver_t& solver, // forwards all four arguments to ts->Run
+                        deviceMemory<dfloat>& o_q,
+                        dfloat start, dfloat end) {
+  assertInitialized(); // aborts when ts is null
+  ts->Run(solver, o_q, start, end);
+}
+
+void timeStepper_t::SetTimeStep(dfloat dt_) { // hands dt_ to the object held in ts
+  assertInitialized(); // aborts when ts is null
+  ts->SetTimeStep(dt_);
+}
+
+dfloat timeStepper_t::GetTimeStep() { // returns the dt stored on the object held in ts
+  assertInitialized(); // aborts when ts is null
+  return ts->dt; // reads the dt member directly, not through an accessor
+}
+
+dfloat timeStepper_t::GetGamma() { // returns whatever ts->GetGamma() reports
+  assertInitialized(); // aborts when ts is null
+  return ts->GetGamma();
+}
+
+void timeStepper_t::assertInitialized() { // guard called at the top of every forwarding method
+  LIBP_ABORT("timeStepper_t not initialized", // message reported on failure
+             ts==nullptr); // abort condition: no stepper object has been attached
+}
+
+} //namespace libp
diff --git a/libs/timeStepper/timeStepperAB3.cpp b/libs/timeStepper/timeStepperAB3.cpp
index 3c60e01eb..fc2249e31 100644
--- a/libs/timeStepper/timeStepperAB3.cpp
+++ b/libs/timeStepper/timeStepperAB3.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,23 +27,28 @@ SOFTWARE.
 #include "core.hpp"
 #include "timeStepper.hpp"
 
+namespace libp {
+
 namespace TimeStepper {
 
 /* Adams Bashforth, order 3 */
 ab3::ab3(dlong Nelements, dlong NhaloElements,
-         int Np, int Nfields, solver_t& _solver):
-  timeStepper_t(Nelements, NhaloElements, Np, Nfields, _solver) {
-
-  platform_t &platform = solver.platform;
+         int Np, int Nfields,
+         platform_t& _platform, comm_t _comm):
+  timeStepperBase_t(Nelements, NhaloElements, Np, Nfields,
+                    _platform, _comm) {
 
   Nstages = 3;
   shiftIndex = 0;
 
-  o_rhsq = platform.malloc(Nstages*N*sizeof(dfloat));
+  memory<dfloat> rhsq(Nstages*N,0.0);
+  o_rhsq = platform.malloc<dfloat>(rhsq);
+
+  properties_t kernelInfo = platform.props(); //copy base occa properties from the platform
 
-  occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+  const int blocksize=256;
 
-  kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+  kernelInfo["defines/" "p_blockSize"] = blocksize;
   kernelInfo["defines/" "p_Nstages"] = Nstages;
 
   updateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/"
@@ -57,19 +62,19 @@ ab3::ab3(dlong Nelements, dlong NhaloElements,
                          3./2.,   -1./2.,    0.0,
                        23./12., -16./12., 5./12.};
 
-  ab_a = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat));
-  memcpy(ab_a, _ab_a, Nstages*Nstages*sizeof(dfloat));
+  ab_a.malloc(Nstages*Nstages);
+  ab_a.copyFrom(_ab_a);
 
-  o_ab_a = platform.malloc(Nstages*Nstages*sizeof(dfloat), ab_a);
+  o_ab_a = platform.malloc<dfloat>(ab_a);
 }
 
-void ab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
+void ab3::Run(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat start, dfloat end) {
 
   dfloat time = start;
 
   solver.Report(time,0);
 
-  dfloat outputInterval;
+  dfloat outputInterval=0.0;
   solver.settings.getSetting("OUTPUT INTERVAL", outputInterval);
 
   dfloat outputTime = time + outputInterval;
@@ -77,7 +82,7 @@ void ab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   int tstep=0;
   int order=0;
   while (time < end) {
-    Step(o_q, time, dt, order);
+    Step(solver, o_q, time, dt, order);
     time += dt;
     tstep++;
     if (order<Nstages-1) order++;
@@ -90,13 +95,13 @@ void ab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   }
 }
 
-void ab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
+void ab3::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt, int order) {
 
   //rhs at current index
-  occa::memory o_rhsq0 = o_rhsq + shiftIndex*N*sizeof(dfloat);
+  deviceMemory<dfloat> o_rhsq0 = o_rhsq + shiftIndex*N;
 
   //A coefficients at current order
-  occa::memory o_A = o_ab_a + order*Nstages*sizeof(dfloat);
+  deviceMemory<dfloat> o_A = o_ab_a + order*Nstages;
 
   //evaluate ODE rhs = f(q,t)
   solver.rhsf(o_q, o_rhsq0, time);
@@ -113,45 +118,37 @@ void ab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
   shiftIndex = (shiftIndex+Nstages-1)%Nstages;
 }
 
-ab3::~ab3() {
-  if (o_rhsq.size()) o_rhsq.free();
-  if (o_ab_a.size()) o_ab_a.free();
-
-  if (ab_a) free(ab_a);
-
-  updateKernel.free();
-}
-
 /**************************************************/
 /* PML version                                    */
 /**************************************************/
 
 /* Adams Bashforth, order 3 */
 ab3_pml::ab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
-                int Np, int Nfields, int Npmlfields, solver_t& _solver):
-  ab3(Nelements, NhaloElements, Np, Nfields, _solver),
+                int Np, int Nfields, int Npmlfields,
+                platform_t& _platform, comm_t _comm):
+  ab3(Nelements, NhaloElements, Np, Nfields, _platform, _comm),
   Npml(NpmlElements*Np*Npmlfields) {
 
   if (Npml) {
-    platform_t &platform = solver.platform;
-
-    dfloat *pmlq = (dfloat *) calloc(Npml,sizeof(dfloat));
-    o_pmlq   = platform.malloc(Npml*sizeof(dfloat), pmlq);
-    free(pmlq);
-
-    o_rhspmlq = platform.malloc(Nstages*Npml*sizeof(dfloat));
+    memory<dfloat> pmlq(Npml,0.0);
+    o_pmlq   = platform.malloc<dfloat>(pmlq);
+
+    //Step only fills one stage slot per call, so zero-fill the whole
+    // pml rhs history, as ab3::ab3 does for o_rhsq
+    memory<dfloat> rhspmlq(Nstages*Npml,0.0);
+    o_rhspmlq = platform.malloc<dfloat>(rhspmlq);
   }
 }
 
-void ab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
+void ab3_pml::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt, int order) {
 
   //rhs at current index
-  occa::memory o_rhsq0    = o_rhsq    + shiftIndex*N*sizeof(dfloat);
-  occa::memory o_rhspmlq0;
-  if (Npml)    o_rhspmlq0 = o_rhspmlq + shiftIndex*Npml*sizeof(dfloat);
+  deviceMemory<dfloat> o_rhsq0 = o_rhsq + shiftIndex*N;
+  deviceMemory<dfloat> o_rhspmlq0;
+  if (Npml)    o_rhspmlq0 = o_rhspmlq + shiftIndex*Npml;
 
   //A coefficients at current order
-  occa::memory o_A = o_ab_a + order*Nstages*sizeof(dfloat);
+  deviceMemory<dfloat> o_A = o_ab_a + order*Nstages;
 
   //evaluate ODE rhs = f(q,t)
   solver.rhsf_pml(o_q, o_pmlq, o_rhsq0, o_rhspmlq0, time);
@@ -175,9 +172,6 @@ void ab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
   shiftIndex = (shiftIndex+Nstages-1)%Nstages;
 }
 
-ab3_pml::~ab3_pml() {
-  if (o_pmlq.size()) o_pmlq.free();
-  if (o_rhspmlq.size()) o_rhspmlq.free();
-}
-
 } //namespace TimeStepper
+
+} //namespace libp
diff --git a/libs/timeStepper/timeStepperDOPRI5.cpp b/libs/timeStepper/timeStepperDOPRI5.cpp
index b956aab57..ac0c0237c 100644
--- a/libs/timeStepper/timeStepperDOPRI5.cpp
+++ b/libs/timeStepper/timeStepperDOPRI5.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,36 +27,38 @@ SOFTWARE.
 #include "core.hpp"
 #include "timeStepper.hpp"
 
+namespace libp {
+
 namespace TimeStepper {
 
 dopri5::dopri5(dlong Nelements, dlong NhaloElements,
-               int Np, int Nfields, solver_t& _solver, MPI_Comm _comm):
-  timeStepper_t(Nelements, NhaloElements, Np, Nfields, _solver), comm(_comm) {
-
-  platform_t &platform = solver.platform;
+               int Np, int Nfields,
+               platform_t& _platform, comm_t _comm):
+  timeStepperBase_t(Nelements, NhaloElements,
+                    Np, Nfields, _platform, _comm) {
 
   Nrk = 7;
 
-  o_rhsq   = platform.malloc(N*sizeof(dfloat));
-  o_rkq    = platform.malloc((N+Nhalo)*sizeof(dfloat));
-  o_rkrhsq = platform.malloc(Nrk*N*sizeof(dfloat));
-  o_rkerr  = platform.malloc(N*sizeof(dfloat));
+  o_rhsq   = platform.malloc<dfloat>(N);
+  o_rkq    = platform.malloc<dfloat>(N+Nhalo);
+  o_rkrhsq = platform.malloc<dfloat>(Nrk*N);
+  o_rkerr  = platform.malloc<dfloat>(N);
+
+  o_saveq  = platform.malloc<dfloat>(N);
 
-  o_saveq  = platform.malloc(N*sizeof(dfloat));
+  const int blocksize = 256;
 
-  Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
-  errtmp = (dfloat*) platform.hostMalloc(Nblock*sizeof(dfloat),
-                                          NULL, h_errtmp);
-  o_errtmp = platform.malloc(Nblock*sizeof(dfloat));
+  Nblock = (N+blocksize-1)/blocksize;
+  h_errtmp = platform.hostMalloc<dfloat>(Nblock);
+  o_errtmp = platform.malloc<dfloat>(Nblock);
 
-  hlong Nlocal = N;
-  hlong Ntotal;
-  MPI_Allreduce(&Nlocal, &Ntotal, 1, MPI_HLONG, MPI_SUM, comm);
+  hlong Ntotal = N;
+  comm.Allreduce(Ntotal);
 
-  occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+  properties_t kernelInfo = platform.props(); //copy base occa properties from the platform
 
   //add defines
-  kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE;
+  kernelInfo["defines/" "p_blockSize"] = (int)blocksize;
 
   rkUpdateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/"
                                     "timeStepperDOPRI5.okl",
@@ -84,16 +86,16 @@ dopri5::dopri5(dlong Nelements, dlong NhaloElements,
                           35.0/384.0,             0.0,   500.0/1113.0,  125.0/192.0,  -2187.0/6784.0, 11.0/84.0, 0.0 };
   dfloat _rkE[7] = {71.0/57600.0,  0.0, -71.0/16695.0, 71.0/1920.0, -17253.0/339200.0, 22.0/525.0, -1.0/40.0 };
 
-  rkC = (dfloat*) calloc(Nrk, sizeof(dfloat));
-  rkE = (dfloat*) calloc(Nrk, sizeof(dfloat));
-  rkA = (dfloat*) calloc(Nrk*Nrk, sizeof(dfloat));
+  rkC.malloc(Nrk);
+  rkE.malloc(Nrk);
+  rkA.malloc(Nrk*Nrk);
 
-  memcpy(rkC, _rkC, Nrk*sizeof(dfloat));
-  memcpy(rkE, _rkE, Nrk*sizeof(dfloat));
-  memcpy(rkA, _rkA, Nrk*Nrk*sizeof(dfloat));
+  rkC.copyFrom(_rkC);
+  rkE.copyFrom(_rkE);
+  rkA.copyFrom(_rkA);
 
-  o_rkA = platform.malloc(Nrk*Nrk*sizeof(dfloat), rkA);
-  o_rkE = platform.malloc(Nrk*sizeof(dfloat), rkE);
+  o_rkA = platform.malloc<dfloat>(rkA);
+  o_rkE = platform.malloc<dfloat>(rkE);
 
   dtMIN = 1E-9; //minumum allowed timestep
   ATOL = 1E-6;  //absolute error tolerance
@@ -112,16 +114,16 @@ dopri5::dopri5(dlong Nelements, dlong NhaloElements,
   sqrtinvNtotal = 1.0/sqrt(Ntotal);
 }
 
-void dopri5::Run(occa::memory &o_q, dfloat start, dfloat end) {
+void dopri5::Run(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat start, dfloat end) {
 
   dfloat time = start;
 
   // int rank;
-  // MPI_Comm_rank(comm, &rank);
+  // comm_rank_t(comm, &rank);
 
   solver.Report(time,0);
 
-  dfloat outputInterval;
+  dfloat outputInterval=0.0;
   solver.settings.getSetting("OUTPUT INTERVAL", outputInterval);
 
   dfloat outputTime = time + outputInterval;
@@ -130,23 +132,17 @@ void dopri5::Run(occa::memory &o_q, dfloat start, dfloat end) {
 
   while (time < end) {
 
-    if (dt<dtMIN){
-      stringstream ss;
-      ss << "Time step became too small at time step = " << tstep;
-      LIBP_ABORT(ss.str());
-    }
-    if (std::isnan(dt)) {
-      stringstream ss;
-      ss << "Solution became unstable at time step = " << tstep;
-      LIBP_ABORT(ss.str());
-    }
+    LIBP_ABORT("Time step became too small at time step = " << tstep,
+               dt<dtMIN);
+    LIBP_ABORT("Solution became unstable at time step = " << tstep,
+               std::isnan(dt));
 
     //check for final timestep
     if (time+dt > end){
       dt = end-time;
     }
 
-    Step(o_q, time, dt);
+    Step(solver, o_q, time, dt);
 
     // compute Dopri estimator
     dfloat err = Estimater(o_q);
@@ -155,7 +151,7 @@ void dopri5::Run(occa::memory &o_q, dfloat start, dfloat end) {
     dfloat fac1 = pow(err,exp1);
     dfloat fac = fac1/pow(facold,beta);
 
-    fac = mymax(invfactor2, mymin(invfactor1,fac/safe));
+    fac = std::max(invfactor2, std::min(invfactor1,fac/safe));
     dfloat dtnew = dt/fac;
 
     if (err<1.0) { //dt is accepted
@@ -174,7 +170,7 @@ void dopri5::Run(occa::memory &o_q, dfloat start, dfloat end) {
         //   printf("Taking output mini step: %g\n", dt);
 
         // time step to output
-        Step(o_q, time, dt);
+        Step(solver, o_q, time, dt);
 
         // shift for output
         o_rkq.copyTo(o_q);
@@ -200,14 +196,15 @@ void dopri5::Run(occa::memory &o_q, dfloat start, dfloat end) {
       time += dt;
       while (time>outputTime) outputTime+= outputInterval; //catch up next output in case dt>outputInterval
 
-      facold = mymax(err,1E-4); // hard coded factor ?
+      constexpr dfloat errMax = 1.0e-4;  // hard coded factor ?
+      facold = std::max(err,errMax);
 
       // if (!rank)
       //   printf("\r time = %g (%d), dt = %g accepted                      ", time, allStep,  dt);
 
       tstep++;
     } else {
-      dtnew = dt/(mymax(invfactor1,fac1/safe));
+      dtnew = dt/(std::max(invfactor1,fac1/safe));
 
       // if (!rank)
       //   printf("\r time = %g (%d), dt = %g rejected, trying %g", time, allStep, dt, dtnew);
@@ -222,19 +219,19 @@ void dopri5::Run(occa::memory &o_q, dfloat start, dfloat end) {
   //   printf("%d accepted steps and %d total steps\n", tstep, allStep);
 }
 
-void dopri5::Backup(occa::memory &o_Q) {
-  o_saveq.copyFrom(o_Q, N*sizeof(dfloat));
+void dopri5::Backup(deviceMemory<dfloat> &o_Q) { // stash o_Q in o_saveq so Restore can bring it back
+  o_saveq.copyFrom(o_Q, N); // length is now N entries; the old call passed N*sizeof(dfloat) bytes
 }
 
-void dopri5::Restore(occa::memory &o_Q) {
-  o_saveq.copyTo(o_Q, N*sizeof(dfloat));
+void dopri5::Restore(deviceMemory<dfloat> &o_Q) { // inverse of Backup: o_saveq is written into o_Q
+  o_saveq.copyTo(o_Q, N); // length is now N entries; the old call passed N*sizeof(dfloat) bytes
 }
 
-void dopri5::AcceptStep(occa::memory &o_q, occa::memory &o_rq) {
-  o_q.copyFrom(o_rq, N*sizeof(dfloat));
+void dopri5::AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq) { // o_q is the destination, o_rq the source
+  o_q.copyFrom(o_rq, N); // overwrite the first N entries of o_q with those of o_rq
 }
 
-void dopri5::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
+void dopri5::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt) {
 
   //RK step
   for(int rk=0;rk<Nrk;++rk){
@@ -273,7 +270,7 @@ void dopri5::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
   }
 }
 
-dfloat dopri5::Estimater(occa::memory& o_q){
+dfloat dopri5::Estimater(deviceMemory<dfloat>& o_q){
 
   //Error estimation
   //E. HAIRER, S.P. NORSETT AND G. WANNER, SOLVING ORDINARY
@@ -286,63 +283,44 @@ dfloat dopri5::Estimater(occa::memory& o_q){
                         o_rkerr,
                         o_errtmp);
 
-  o_errtmp.copyTo(errtmp);
-  dfloat localerr = 0;
+  h_errtmp.copyFrom(o_errtmp);
   dfloat err = 0;
   for(dlong n=0;n<Nblock;++n){
-    localerr += errtmp[n];
+    err += h_errtmp[n];
   }
-  MPI_Allreduce(&localerr, &err, 1, MPI_DFLOAT, MPI_SUM, comm);
+  comm.Allreduce(err);
 
   err = sqrt(err)*sqrtinvNtotal;
 
   return err;
 }
 
-dopri5::~dopri5() {
-  if (o_rkq.size()) o_rkq.free();
-  if (o_rkrhsq.size()) o_rkrhsq.free();
-  if (o_rkerr.size()) o_rkerr.free();
-  if (o_errtmp.size()) o_errtmp.free();
-  if (o_rkA.size()) o_rkA.free();
-  if (o_rkE.size()) o_rkE.free();
-
-  if (rkC) free(rkC);
-  if (rkA) free(rkA);
-  if (rkE) free(rkE);
-
-  rkUpdateKernel.free();
-  rkStageKernel.free();
-  rkErrorEstimateKernel.free();
-}
-
 /**************************************************/
 /* PML version                                    */
 /**************************************************/
 
 dopri5_pml::dopri5_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
                       int Np, int Nfields, int Npmlfields,
-                      solver_t& _solver, MPI_Comm _comm):
-  dopri5(Nelements, NhaloElements, Np, Nfields, _solver, _comm),
+                      platform_t& _platform, comm_t _comm):
+  dopri5(Nelements, NhaloElements, Np, Nfields, _platform, _comm),
   Npml(Npmlfields*Np*NpmlElements) {
 
   if (Npml) {
-    platform_t &platform = solver.platform;
+    memory<dfloat> pmlq(Npml,0.0);
+    o_pmlq = platform.malloc<dfloat>(pmlq);
 
-    dfloat *pmlq = (dfloat *) calloc(Npml,sizeof(dfloat));
-    o_pmlq   = platform.malloc(Npml*sizeof(dfloat), pmlq);
-    free(pmlq);
+    o_rhspmlq   = platform.malloc<dfloat>(Npml);
+    o_rkpmlq    = platform.malloc<dfloat>(Npml);
+    o_rkrhspmlq = platform.malloc<dfloat>(Nrk*Npml);
 
-    o_rhspmlq   = platform.malloc(Npml*sizeof(dfloat));
-    o_rkpmlq    = platform.malloc(Npml*sizeof(dfloat));
-    o_rkrhspmlq = platform.malloc(Nrk*Npml*sizeof(dfloat));
+    o_savepmlq  = platform.malloc<dfloat>(Npml);
 
-    o_savepmlq  = platform.malloc(Npml*sizeof(dfloat));
+    properties_t kernelInfo = platform.props(); //copy base occa properties from the platform
 
-    occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+    const int blocksize = 256;
 
     //add defines
-    kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE;
+    kernelInfo["defines/" "p_blockSize"] = (int)blocksize;
 
     rkPmlUpdateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/"
                                       "timeStepperDOPRI5.okl",
@@ -351,25 +329,25 @@ dopri5_pml::dopri5_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
   }
 }
 
-void dopri5_pml::Backup(occa::memory &o_Q) {
-  o_saveq.copyFrom(o_Q, N*sizeof(dfloat));
+void dopri5_pml::Backup(deviceMemory<dfloat> &o_Q) { // stash o_Q and, when Npml is nonzero, the pml stage state
+  o_saveq.copyFrom(o_Q, N); // N entries of o_Q into o_saveq
   if (Npml)
-    o_savepmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat));
+    o_savepmlq.copyFrom(o_rkpmlq, Npml); // the saved pml buffer is o_rkpmlq, not o_pmlq
 }
 
-void dopri5_pml::Restore(occa::memory &o_Q) {
-  o_saveq.copyTo(o_Q, N*sizeof(dfloat));
+void dopri5_pml::Restore(deviceMemory<dfloat> &o_Q) { // inverse of Backup for both the field and pml buffers
+  o_saveq.copyTo(o_Q, N); // N entries of o_saveq back into o_Q
   if (Npml)
-    o_savepmlq.copyTo(o_rkpmlq, Npml*sizeof(dfloat));
+    o_savepmlq.copyTo(o_rkpmlq, Npml); // Npml entries of o_savepmlq back into o_rkpmlq
 }
 
-void dopri5_pml::AcceptStep(occa::memory &o_q, occa::memory &o_rq) {
-  o_q.copyFrom(o_rq, N*sizeof(dfloat));
+void dopri5_pml::AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq) { // o_q is the destination, o_rq the source
+  o_q.copyFrom(o_rq, N); // overwrite the first N entries of o_q with those of o_rq
   if (Npml)
-    o_pmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat));
+    o_pmlq.copyFrom(o_rkpmlq, Npml); // likewise o_pmlq takes Npml entries from o_rkpmlq
 }
 
-void dopri5_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
+void dopri5_pml::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt) {
 
   //RK step
   for(int rk=0;rk<Nrk;++rk){
@@ -425,12 +403,6 @@ void dopri5_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
   }
 }
 
-dopri5_pml::~dopri5_pml() {
-  if (o_pmlq.size()) o_pmlq.free();
-  if (o_rkpmlq.size()) o_rkpmlq.free();
-  if (o_rhspmlq.size()) o_rhspmlq.free();
-  if (o_rkrhspmlq.size()) o_rkrhspmlq.free();
-  if (o_savepmlq.size()) o_savepmlq.free();
-}
-
 } //namespace TimeStepper
+
+} //namespace libp
diff --git a/libs/timeStepper/timeStepperEXTBDF3.cpp b/libs/timeStepper/timeStepperEXTBDF3.cpp
index 4cd9da9d3..8d6b28dec 100644
--- a/libs/timeStepper/timeStepperEXTBDF3.cpp
+++ b/libs/timeStepper/timeStepperEXTBDF3.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,31 +27,33 @@ SOFTWARE.
 #include "core.hpp"
 #include "timeStepper.hpp"
 
+namespace libp {
+
 namespace TimeStepper {
 
 /* Backward Difference Formula, order 3, with extrapolation */
 extbdf3::extbdf3(dlong Nelements, dlong NhaloElements,
-                 int Np, int Nfields, solver_t& _solver):
-  timeStepper_t(Nelements, NhaloElements, Np, Nfields, _solver) {
-
-  platform_t &platform = solver.platform;
+                 int Np, int Nfields,
+                 platform_t& _platform, comm_t _comm):
+  timeStepperBase_t(Nelements, NhaloElements, Np, Nfields,
+                    _platform, _comm) {
 
   Nstages = 3;
   shiftIndex = 0;
 
-  dfloat *qn = (dfloat *) calloc(Nstages*N,sizeof(dfloat));
-  o_qn = platform.malloc(Nstages*N*sizeof(dfloat),qn); //q history
+  memory<dfloat> qn(Nstages*N, 0.0);
+  o_qn = platform.malloc<dfloat>(qn); //q history
 
-  dfloat *rhs = (dfloat *) calloc(N,sizeof(dfloat));
-  o_rhs = platform.malloc(N*sizeof(dfloat), rhs); //rhs storage
-  free(rhs);
+  memory<dfloat> rhs(N,0.0);
+  o_rhs = platform.malloc<dfloat>(rhs); //rhs storage
 
-  o_F  = platform.malloc(Nstages*N*sizeof(dfloat), qn); //F(q) history (explicit part)
-  free(qn);
+  o_F  = platform.malloc<dfloat>(qn); //F(q) history (explicit part)
 
-  occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+  properties_t kernelInfo = platform.props(); //copy base occa properties from the platform
 
-  kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+  const int blocksize=256;
+
+  kernelInfo["defines/" "p_blockSize"] = blocksize;
   kernelInfo["defines/" "p_Nstages"] = Nstages;
 
   rhsKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/"
@@ -69,26 +71,26 @@ extbdf3::extbdf3(dlong Nelements, dlong NhaloElements,
                          3./2.,    2., -1./2.,    0.,
                         11./6.,    3., -3./2., 1./3.};
 
-  extbdf_a = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat));
-  extbdf_b = (dfloat*) calloc(Nstages*(Nstages+1), sizeof(dfloat));
-  memcpy(extbdf_a, _a, Nstages*Nstages*sizeof(dfloat));
-  memcpy(extbdf_b, _b, Nstages*(Nstages+1)*sizeof(dfloat));
+  extbdf_a.malloc(Nstages*Nstages);
+  extbdf_b.malloc(Nstages*(Nstages+1));
+  extbdf_a.copyFrom(_a);
+  extbdf_b.copyFrom(_b);
 
-  o_extbdf_a = platform.malloc(Nstages*Nstages*sizeof(dfloat), extbdf_a);
-  o_extbdf_b = platform.malloc(Nstages*(Nstages+1)*sizeof(dfloat), extbdf_b);
+  o_extbdf_a = platform.malloc<dfloat>(extbdf_a);
+  o_extbdf_b = platform.malloc<dfloat>(extbdf_b);
 }
 
-dfloat extbdf3::getGamma() {
-  return *(extbdf_b + (Nstages-1)*(Nstages+1)); //first entry of last row of B
+dfloat extbdf3::GetGamma() { // renamed from getGamma; rows of extbdf_b hold Nstages+1 entries
+  return extbdf_b[(Nstages-1)*(Nstages+1)]; //first entry of last row of B
 }
 
-void extbdf3::Run(occa::memory &o_q, dfloat start, dfloat end) {
+void extbdf3::Run(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat start, dfloat end) {
 
   dfloat time = start;
 
   solver.Report(time,0);
 
-  dfloat outputInterval;
+  dfloat outputInterval=0.0;
   solver.settings.getSetting("OUTPUT INTERVAL", outputInterval);
 
   dfloat outputTime = time + outputInterval;
@@ -96,7 +98,7 @@ void extbdf3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   int tstep=0;
   int order=0;
   while (time < end) {
-    Step(o_q, time, dt, order);
+    Step(solver, o_q, time, dt, order);
     time += dt;
     tstep++;
     if (order<Nstages-1) order++;
@@ -109,15 +111,15 @@ void extbdf3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   }
 }
 
-void extbdf3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
+void extbdf3::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt, int order) {
 
   //F(q) at current index
-  occa::memory o_F0 = o_F + shiftIndex*N*sizeof(dfloat);
+  deviceMemory<dfloat> o_F0 = o_F + shiftIndex*N;
 
   //coefficients at current order
-  occa::memory o_A = o_extbdf_a + order*Nstages*sizeof(dfloat);
-  occa::memory o_B = o_extbdf_b + order*(Nstages+1)*sizeof(dfloat);
-  dfloat *B = extbdf_b + order*(Nstages+1);
+  deviceMemory<dfloat> o_A = o_extbdf_a + order*Nstages;
+  deviceMemory<dfloat> o_B = o_extbdf_b + order*(Nstages+1);
+  memory<dfloat> B = extbdf_b + order*(Nstages+1);
 
   //evaluate explicit part of rhs: F(q,t)
   solver.rhs_imex_f(o_q, o_F0, time);
@@ -143,17 +145,6 @@ void extbdf3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
   shiftIndex = (shiftIndex+Nstages-1)%Nstages;
 }
 
-extbdf3::~extbdf3() {
-  if (o_rhs.size()) o_rhs.free();
-  if (o_qn.size()) o_qn.free();
-  if (o_F.size()) o_F.free();
-  if (o_extbdf_a.size()) o_extbdf_a.free();
-  if (o_extbdf_b.size()) o_extbdf_b.free();
-
-  if (extbdf_a) free(extbdf_a);
-  if (extbdf_b) free(extbdf_b);
-
-  rhsKernel.free();
-}
-
 } //namespace TimeStepper
+
+} //namespace libp
diff --git a/libs/timeStepper/timeStepperLSERK4.cpp b/libs/timeStepper/timeStepperLSERK4.cpp
index c7ebdea74..2ffa4e4eb 100644
--- a/libs/timeStepper/timeStepperLSERK4.cpp
+++ b/libs/timeStepper/timeStepperLSERK4.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,22 +27,28 @@ SOFTWARE.
 #include "core.hpp"
 #include "timeStepper.hpp"
 
+namespace libp {
+
 namespace TimeStepper {
 
 lserk4::lserk4(dlong Nelements, dlong NhaloElements,
-               int Np, int Nfields, solver_t& _solver):
-  timeStepper_t(Nelements, NhaloElements, Np, Nfields, _solver) {
-
-  platform_t &platform = solver.platform;
+               int Np, int Nfields,
+               platform_t& _platform, comm_t _comm):
+  timeStepperBase_t(Nelements, NhaloElements, Np, Nfields,
+                    _platform, _comm) {
 
   Nrk = 5;
 
-  o_resq = platform.malloc(N*sizeof(dfloat));
-  o_rhsq = platform.malloc(N*sizeof(dfloat));
+  o_resq = platform.malloc<dfloat>(N);
+  o_rhsq = platform.malloc<dfloat>(N);
+
+  o_saveq = platform.malloc<dfloat>(N);
 
-  occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+  properties_t kernelInfo = platform.props(); //copy base occa properties from platform
 
-  kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+  const int blocksize=256;
+
+  kernelInfo["defines/" "p_blockSize"] = blocksize;
 
   updateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/"
                                     "timeStepperLSERK4.okl",
@@ -68,23 +74,21 @@ lserk4::lserk4(dlong Nelements, dlong NhaloElements,
        2802321613138.0/2924317926251.0 ,
        1.0};
 
-  rka = (dfloat*) calloc(Nrk, sizeof(dfloat));
-  rkb = (dfloat*) calloc(Nrk, sizeof(dfloat));
-  rkc = (dfloat*) calloc(Nrk+1, sizeof(dfloat));
-  memcpy(rka, _rka, Nrk*sizeof(dfloat));
-  memcpy(rkb, _rkb, Nrk*sizeof(dfloat));
-  memcpy(rkc, _rkc, (Nrk+1)*sizeof(dfloat));
+  rka.malloc(Nrk);
+  rkb.malloc(Nrk);
+  rkc.malloc(Nrk+1);
+  rka.copyFrom(_rka);
+  rkb.copyFrom(_rkb);
+  rkc.copyFrom(_rkc);
 }
 
-void lserk4::Run(occa::memory &o_q, dfloat start, dfloat end) {
-
-  platform_t &platform = solver.platform;
+void lserk4::Run(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat start, dfloat end) {
 
   dfloat time = start;
 
   solver.Report(time,0);
 
-  dfloat outputInterval;
+  dfloat outputInterval=0.0;
   solver.settings.getSetting("OUTPUT INTERVAL", outputInterval);
 
   dfloat outputTime = time + outputInterval;
@@ -96,20 +100,18 @@ void lserk4::Run(occa::memory &o_q, dfloat start, dfloat end) {
     if (time<outputTime && time+dt>=outputTime) {
 
       //save current state
-      occa::memory o_saveq = platform.malloc(N*sizeof(dfloat));
-      o_saveq.copyFrom(o_q, N*sizeof(dfloat));
+      o_saveq.copyFrom(o_q, N);
 
       stepdt = outputTime-time;
 
       //take small time step
-      Step(o_q, time, stepdt);
+      Step(solver, o_q, time, stepdt);
 
       //report state
       solver.Report(outputTime,tstep);
 
       //restore previous state
-      o_q.copyFrom(o_saveq, N*sizeof(dfloat));
-      o_saveq.free();
+      o_q.copyFrom(o_saveq, N);
 
       outputTime += outputInterval;
     }
@@ -121,13 +123,13 @@ void lserk4::Run(occa::memory &o_q, dfloat start, dfloat end) {
       stepdt = dt;
     }
 
-    Step(o_q, time, stepdt);
+    Step(solver, o_q, time, stepdt);
     time += stepdt;
     tstep++;
   }
 }
 
-void lserk4::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
+void lserk4::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt) {
 
   // Low storage explicit Runge Kutta (5 stages, 4th order)
   for(int rk=0;rk<Nrk;++rk){
@@ -143,16 +145,6 @@ void lserk4::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
   }
 }
 
-lserk4::~lserk4() {
-  if (o_rhsq.size()) o_rhsq.free();
-  if (o_resq.size()) o_resq.free();
-
-  if (rka) free(rka);
-  if (rkb) free(rkb);
-  if (rkc) free(rkc);
-
-  updateKernel.free();
-}
 
 /**************************************************/
 /* PML version                                    */
@@ -160,23 +152,20 @@ lserk4::~lserk4() {
 
 lserk4_pml::lserk4_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements,
                       int _Np, int _Nfields, int _Npmlfields,
-                      solver_t& _solver):
-  lserk4(_Nelements, _NhaloElements, _Np, _Nfields, _solver),
+                      platform_t& _platform, comm_t _comm):
+  lserk4(_Nelements, _NhaloElements, _Np, _Nfields, _platform, _comm),
   Npml(_Npmlfields*_Np*_NpmlElements) {
 
   if (Npml) {
-    platform_t &platform = solver.platform;
-
-    dfloat *pmlq = (dfloat *) calloc(Npml,sizeof(dfloat));
-    o_pmlq   = platform.malloc(Npml*sizeof(dfloat), pmlq);
-    free(pmlq);
+    memory<dfloat> pmlq(Npml,0.0);
+    o_pmlq = platform.malloc<dfloat>(pmlq);
 
-    o_respmlq = platform.malloc(Npml*sizeof(dfloat));
-    o_rhspmlq = platform.malloc(Npml*sizeof(dfloat));
+    o_respmlq = platform.malloc<dfloat>(Npml);
+    o_rhspmlq = platform.malloc<dfloat>(Npml);
   }
 }
 
-void lserk4_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
+void lserk4_pml::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt) {
 
   // Low storage explicit Runge Kutta (5 stages, 4th order)
   for(int rk=0;rk<Nrk;++rk){
@@ -195,10 +184,6 @@ void lserk4_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
   }
 }
 
-lserk4_pml::~lserk4_pml() {
-  if (o_pmlq.size()) o_pmlq.free();
-  if (o_rhspmlq.size()) o_rhspmlq.free();
-  if (o_respmlq.size()) o_respmlq.free();
-}
-
 } //namespace TimeStepper
+
+} //namespace libp
diff --git a/libs/timeStepper/timeStepperMRAB3.cpp b/libs/timeStepper/timeStepperMRAB3.cpp
index d80238b64..e6a1f761f 100644
--- a/libs/timeStepper/timeStepperMRAB3.cpp
+++ b/libs/timeStepper/timeStepperMRAB3.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,39 +27,41 @@ SOFTWARE.
 #include "core.hpp"
 #include "timeStepper.hpp"
 
+namespace libp {
+
 namespace TimeStepper {
 
 mrab3::mrab3(dlong Nelements, dlong NhaloElements,
-               int Np, int _Nfields, solver_t& _solver, mesh_t& _mesh):
-  timeStepper_t(Nelements, NhaloElements, Np, _Nfields, _solver),
+               int Np, int _Nfields,
+               platform_t& _platform, mesh_t& _mesh):
+  timeStepperBase_t(Nelements, NhaloElements,
+                    Np, _Nfields, _platform, _mesh.comm),
   mesh(_mesh),
   Nlevels(mesh.mrNlevels),
   Nfields(_Nfields) {
 
-  platform_t &platform = solver.platform;
-
   Nstages = 3;
 
-  dfloat *rhsq0 = (dfloat*) calloc(N, sizeof(dfloat));
-  o_rhsq0 = platform.malloc(N*sizeof(dfloat), rhsq0);
-  free(rhsq0);
+  memory<dfloat> rhsq0(N, 0.0);
+  o_rhsq0 = platform.malloc<dfloat>(rhsq0);
+
+  memory<dfloat> rhsq((Nstages-1)*N, 0.0);
+  o_rhsq = platform.malloc<dfloat>(rhsq);
 
-  dfloat *rhsq = (dfloat*) calloc((Nstages-1)*N, sizeof(dfloat));
-  o_rhsq = platform.malloc((Nstages-1)*N*sizeof(dfloat), rhsq);
-  free(rhsq);
+  o_fQM = platform.malloc<dfloat>((mesh.Nelements+mesh.totalHaloPairs)*mesh.Nfp
+                                  *mesh.Nfaces*Nfields);
 
-  o_fQM = platform.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Nfp
-                          *mesh.Nfaces*Nfields*sizeof(dfloat));
+  properties_t kernelInfo = platform.props(); //copy base occa properties from platform
 
-  occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+  const int blocksize=256;
 
-  kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+  kernelInfo["defines/" "p_blockSize"] = blocksize;
   kernelInfo["defines/" "p_Nstages"] = Nstages;
   kernelInfo["defines/" "p_Np"] = mesh.Np;
   kernelInfo["defines/" "p_Nfp"] = mesh.Nfp;
   kernelInfo["defines/" "p_Nfaces"] = mesh.Nfaces;
   kernelInfo["defines/" "p_Nfields"] = Nfields;
-  int maxNodes = mymax(mesh.Np, mesh.Nfp*mesh.Nfaces);
+  int maxNodes = std::max(mesh.Np, mesh.Nfp*mesh.Nfaces);
   kernelInfo["defines/" "p_maxNodes"] = maxNodes;
 
   updateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/"
@@ -81,37 +83,36 @@ mrab3::mrab3(dlong Nelements, dlong NhaloElements,
                          5./8.,   -1./8.,    0.0,
                        17./24.,  -7./24., 2./24.};
 
-  ab_a = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat));
-  ab_b = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat));
-  memcpy(ab_a, _ab_a, Nstages*Nstages*sizeof(dfloat));
-  memcpy(ab_b, _ab_b, Nstages*Nstages*sizeof(dfloat));
+  ab_a.malloc(Nstages*Nstages);
+  ab_b.malloc(Nstages*Nstages);
+  ab_a.copyFrom(_ab_a);
+  ab_b.copyFrom(_ab_b);
 
-  shiftIndex = (int*) platform.hostMalloc(Nlevels*sizeof(int),
-                                          NULL, h_shiftIndex);
-  o_shiftIndex = platform.malloc(Nlevels*sizeof(int));
+  h_shiftIndex = platform.hostMalloc<int>(Nlevels);
+  o_shiftIndex = platform.malloc<int>(Nlevels);
 
-  mrdt = (dfloat*) calloc(Nlevels, sizeof(dfloat));
-  o_mrdt = platform.malloc(Nlevels*sizeof(dfloat), mrdt);
+  mrdt.malloc(Nlevels, 0.0);
+  o_mrdt = platform.malloc<dfloat>(mrdt);
 
-  o_ab_a = platform.malloc(Nstages*Nstages*sizeof(dfloat), ab_a);
-  o_ab_b = platform.malloc(Nstages*Nstages*sizeof(dfloat), ab_b);
+  o_ab_a = platform.malloc<dfloat>(ab_a);
+  o_ab_b = platform.malloc<dfloat>(ab_b);
 }
 
-void mrab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
+void mrab3::Run(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat start, dfloat end) {
 
   dfloat time = start;
 
   //set timesteps and shifting index
   for (int lev=0;lev<Nlevels;lev++) {
     mrdt[lev] = dt*(1 << lev);
-    shiftIndex[lev] = 0;
+    h_shiftIndex[lev] = 0;
   }
   o_mrdt.copyFrom(mrdt);
-  o_shiftIndex.copyFrom(shiftIndex);
+  h_shiftIndex.copyTo(o_shiftIndex);
 
   solver.Report(time,0);
 
-  dfloat outputInterval;
+  dfloat outputInterval=0.0;
   solver.settings.getSetting("OUTPUT INTERVAL", outputInterval);
 
   dfloat outputTime = time + outputInterval;
@@ -135,7 +136,7 @@ void mrab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   int tstep=0;
   int order=0;
   while (time < end) {
-    Step(o_q, time, dt, order);
+    Step(solver, o_q, time, dt, order);
     time += DT;
     tstep++;
     if (order<Nstages-1) order++;
@@ -148,10 +149,10 @@ void mrab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   }
 }
 
-void mrab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
+void mrab3::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt, int order) {
 
-  occa::memory o_A = o_ab_a+order*Nstages*sizeof(dfloat);
-  occa::memory o_B = o_ab_b+order*Nstages*sizeof(dfloat);
+  deviceMemory<dfloat> o_A = o_ab_a+order*Nstages;
+  deviceMemory<dfloat> o_B = o_ab_b+order*Nstages;
 
   for (int Ntick=0; Ntick < (1 << (Nlevels-1));Ntick++) {
 
@@ -186,7 +187,7 @@ void mrab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
     //rotate index
     if (Nstages>2)
       for (int l=0; l<=lev; l++)
-        shiftIndex[l] = (shiftIndex[l]+Nstages-2)%(Nstages-1);
+        h_shiftIndex[l] = (h_shiftIndex[l]+Nstages-2)%(Nstages-1);
 
     //compute intermediate trace values on lev+1 / lev interface
     if (lev+1<Nlevels && mesh.mrInterfaceNelements[lev+1])
@@ -203,57 +204,45 @@ void mrab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
                         o_q,
                         o_fQM);
 
-    // o_shiftIndex.copyFrom(shiftIndex, "async: true");
-    o_shiftIndex.copyFrom(shiftIndex); //Required to keep the update kernel overlapping the transfer,
+    // o_shiftIndex.copyFrom(h_shiftIndex, properties_t("async", true));
+    h_shiftIndex.copyTo(o_shiftIndex); //Required to keep the update kernel overlapping the transfer,
                                        // but why does that happen?
   }
 }
 
-mrab3::~mrab3() {
-  if (o_rhsq.size()) o_rhsq.free();
-  if (o_fQM.size()) o_fQM.free();
-
-  if (ab_a) free(ab_a);
-  if (ab_b) free(ab_b);
-
-  updateKernel.free();
-  traceUpdateKernel.free();
-}
-
 /**************************************************/
 /* PML version                                    */
 /**************************************************/
 
 mrab3_pml::mrab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
-               int Np, int _Nfields, int _Npmlfields, solver_t& _solver, mesh_t& _mesh):
-  mrab3(Nelements, NhaloElements, Np, _Nfields, _solver, _mesh),
+                     int Np, int _Nfields, int _Npmlfields,
+                     platform_t& _platform, mesh_t& _mesh):
+  mrab3(Nelements, NhaloElements,
+        Np, _Nfields, _platform, _mesh),
   Npml(NpmlElements*Np*_Npmlfields),
   Npmlfields(_Npmlfields) {
 
   if (Npml) {
-    platform_t &platform = solver.platform;
+    memory<dfloat> pmlq(Npml, 0.0);
+    o_pmlq = platform.malloc<dfloat>(pmlq);
 
-    dfloat *pmlq = (dfloat*) calloc(Npml, sizeof(dfloat));
-    o_pmlq = platform.malloc(Npml*sizeof(dfloat), pmlq);
-    free(pmlq);
+    memory<dfloat> rhspmlq0(Npml, 0.0);
+    o_rhspmlq0 = platform.malloc<dfloat>(rhspmlq0);
 
-    dfloat *rhspmlq0 = (dfloat*) calloc(Npml, sizeof(dfloat));
-    o_rhspmlq0 = platform.malloc(Npml*sizeof(dfloat), rhspmlq0);
-    free(rhspmlq0);
+    memory<dfloat> rhspmlq((Nstages-1)*Npml, 0.0);
+    o_rhspmlq = platform.malloc<dfloat>(rhspmlq);
 
-    dfloat *rhspmlq = (dfloat*) calloc((Nstages-1)*Npml, sizeof(dfloat));
-    o_rhspmlq = platform.malloc((Nstages-1)*Npml*sizeof(dfloat), rhspmlq);
-    free(rhspmlq);
+    properties_t kernelInfo = platform.props(); //copy base occa properties from platform
 
-    occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+    const int blocksize=256;
 
-    kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+    kernelInfo["defines/" "p_blockSize"] = blocksize;
     kernelInfo["defines/" "p_Nstages"] = Nstages;
     kernelInfo["defines/" "p_Np"] = mesh.Np;
     kernelInfo["defines/" "p_Nfp"] = mesh.Nfp;
     kernelInfo["defines/" "p_Nfaces"] = mesh.Nfaces;
     kernelInfo["defines/" "p_Nfields"] = Nfields;
-    int maxNodes = mymax(mesh.Np, mesh.Nfp*mesh.Nfaces);
+    int maxNodes = std::max(mesh.Np, mesh.Nfp*mesh.Nfaces);
     kernelInfo["defines/" "p_maxNodes"] = maxNodes;
 
     pmlUpdateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/"
@@ -263,10 +252,10 @@ mrab3_pml::mrab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
   }
 }
 
-void mrab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
+void mrab3_pml::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt, int order) {
 
-  occa::memory o_A = o_ab_a+order*Nstages*sizeof(dfloat);
-  occa::memory o_B = o_ab_b+order*Nstages*sizeof(dfloat);
+  deviceMemory<dfloat> o_A = o_ab_a+order*Nstages;
+  deviceMemory<dfloat> o_B = o_ab_b+order*Nstages;
 
   for (int Ntick=0; Ntick < (1 << (Nlevels-1));Ntick++) {
 
@@ -317,7 +306,7 @@ void mrab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
     //rotate index
     if (Nstages>2)
       for (int l=0; l<=lev; l++)
-        shiftIndex[l] = (shiftIndex[l]+Nstages-2)%(Nstages-1);
+        h_shiftIndex[l] = (h_shiftIndex[l]+Nstages-2)%(Nstages-1);
 
     //compute intermediate trace values on lev+1 / lev interface
     if (lev+1<Nlevels && mesh.mrInterfaceNelements[lev+1])
@@ -334,18 +323,12 @@ void mrab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
                         o_q,
                         o_fQM);
 
-    // o_shiftIndex.copyFrom(shiftIndex, "async: true");
-    o_shiftIndex.copyFrom(shiftIndex); //Required to keep the update kernel overlapping the transfer,
+    // o_shiftIndex.copyFrom(h_shiftIndex, properties_t("async", true));
+    h_shiftIndex.copyTo(o_shiftIndex); //Required to keep the update kernel overlapping the transfer,
                                        // but why does that happen?
   }
 }
 
-mrab3_pml::~mrab3_pml() {
-  if (o_pmlq.size()) o_pmlq.free();
-  if (o_rhspmlq0.size()) o_rhspmlq0.free();
-  if (o_rhspmlq.size()) o_rhspmlq.free();
-
-  pmlUpdateKernel.free();
-}
-
 } //namespace TimeStepper
+
+} //namespace libp
diff --git a/libs/timeStepper/timeStepperMRSAAB3.cpp b/libs/timeStepper/timeStepperMRSAAB3.cpp
index 1de0dbe2b..5db1a5558 100644
--- a/libs/timeStepper/timeStepperMRSAAB3.cpp
+++ b/libs/timeStepper/timeStepperMRSAAB3.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,45 +28,47 @@ SOFTWARE.
 #include "timeStepper.hpp"
 #include <complex>
 
+namespace libp {
+
 namespace TimeStepper {
 
 using std::complex;
 
 mrsaab3::mrsaab3(dlong _Nelements, dlong _NhaloElements,
              int _Np, int _Nfields,
-             dfloat *_lambda, solver_t& _solver, mesh_t& _mesh):
-  timeStepper_t(_Nelements, _NhaloElements, _Np, _Nfields, _solver),
+             memory<dfloat> _lambda,
+             platform_t& _platform, mesh_t& _mesh):
+  timeStepperBase_t(_Nelements, _NhaloElements, _Np, _Nfields,
+                    _platform, _mesh.comm),
   mesh(_mesh),
   Nlevels(mesh.mrNlevels),
   Nfields(_Nfields) {
 
-  platform_t &platform = solver.platform;
-
-  lambda = (dfloat *) malloc(Nfields*sizeof(dfloat));
-  memcpy(lambda, _lambda, Nfields*sizeof(dfloat));
+  lambda.malloc(Nfields);
+  lambda.copyFrom(_lambda);
 
   Nstages = 3;
 
-  dfloat *rhsq0 = (dfloat*) calloc(N, sizeof(dfloat));
-  o_rhsq0 = platform.malloc(N*sizeof(dfloat), rhsq0);
-  free(rhsq0);
+  memory<dfloat> rhsq0(N, 0.0);
+  o_rhsq0 = platform.malloc<dfloat>(rhsq0);
+
+  memory<dfloat> rhsq((Nstages-1)*N, 0.0);
+  o_rhsq = platform.malloc<dfloat>(rhsq);
 
-  dfloat *rhsq = (dfloat*) calloc((Nstages-1)*N, sizeof(dfloat));
-  o_rhsq = platform.malloc((Nstages-1)*N*sizeof(dfloat), rhsq);
-  free(rhsq);
+  o_fQM = platform.malloc<dfloat>((mesh.Nelements+mesh.totalHaloPairs)*mesh.Nfp
+                                  *mesh.Nfaces*Nfields);
 
-  o_fQM = platform.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Nfp
-                          *mesh.Nfaces*Nfields*sizeof(dfloat));
+  properties_t kernelInfo = platform.props(); //copy base occa properties from platform
 
-  occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+  const int blocksize=256;
 
-  kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+  kernelInfo["defines/" "p_blockSize"] = blocksize;
   kernelInfo["defines/" "p_Nstages"] = Nstages;
   kernelInfo["defines/" "p_Np"] = mesh.Np;
   kernelInfo["defines/" "p_Nfp"] = mesh.Nfp;
   kernelInfo["defines/" "p_Nfaces"] = mesh.Nfaces;
   kernelInfo["defines/" "p_Nfields"] = Nfields;
-  int maxNodes = mymax(mesh.Np, mesh.Nfp*mesh.Nfaces);
+  int maxNodes = std::max(mesh.Np, mesh.Nfp*mesh.Nfaces);
   kernelInfo["defines/" "p_maxNodes"] = maxNodes;
 
   updateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/"
@@ -78,37 +80,36 @@ mrsaab3::mrsaab3(dlong _Nelements, dlong _NhaloElements,
                                     "mrsaabTraceUpdate",
                                     kernelInfo);
 
-  saab_x = (dfloat*) calloc(Nlevels*Nfields, sizeof(dfloat));
-  saab_a = (dfloat*) calloc(Nlevels*Nfields*Nstages*Nstages, sizeof(dfloat));
-  saab_b = (dfloat*) calloc(Nlevels*Nfields*Nstages*Nstages, sizeof(dfloat));
+  saab_x.malloc(Nlevels*Nfields);
+  saab_a.malloc(Nlevels*Nfields*Nstages*Nstages);
+  saab_b.malloc(Nlevels*Nfields*Nstages*Nstages);
 
-  shiftIndex = (int*) platform.hostMalloc(Nlevels*sizeof(int),
-                                          NULL, h_shiftIndex);
-  o_shiftIndex = platform.malloc(Nlevels*sizeof(int));
+  h_shiftIndex = platform.hostMalloc<int>(Nlevels);
+  o_shiftIndex = platform.malloc<int>(Nlevels);
 
-  mrdt = (dfloat*) calloc(Nlevels, sizeof(dfloat));
-  o_mrdt = platform.malloc(Nlevels*sizeof(dfloat), mrdt);
+  mrdt.malloc(Nlevels, 0.0);
+  o_mrdt = platform.malloc<dfloat>(mrdt);
 
-  o_saab_x = platform.malloc(Nlevels*Nfields*sizeof(dfloat));
-  o_saab_a = platform.malloc(Nlevels*Nfields*Nstages*Nstages*sizeof(dfloat));
-  o_saab_b = platform.malloc(Nlevels*Nfields*Nstages*Nstages*sizeof(dfloat));
+  o_saab_x = platform.malloc<dfloat>(Nlevels*Nfields);
+  o_saab_a = platform.malloc<dfloat>(Nlevels*Nfields*Nstages*Nstages);
+  o_saab_b = platform.malloc<dfloat>(Nlevels*Nfields*Nstages*Nstages);
 }
 
-void mrsaab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
+void mrsaab3::Run(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat start, dfloat end) {
 
   dfloat time = start;
 
   //set timesteps and shifting index
   for (int lev=0;lev<Nlevels;lev++) {
     mrdt[lev] = dt*(1 << lev);
-    shiftIndex[lev] = 0;
+    h_shiftIndex[lev] = 0;
   }
   o_mrdt.copyFrom(mrdt);
-  o_shiftIndex.copyFrom(shiftIndex);
+  h_shiftIndex.copyTo(o_shiftIndex);
 
   solver.Report(time,0);
 
-  dfloat outputInterval;
+  dfloat outputInterval=0.0;
   solver.settings.getSetting("OUTPUT INTERVAL", outputInterval);
 
   dfloat outputTime = time + outputInterval;
@@ -136,7 +137,7 @@ void mrsaab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   int tstep=0;
   int order=0;
   while (time < end) {
-    Step(o_q, time, dt, order);
+    Step(solver, o_q, time, dt, order);
     time += DT;
     tstep++;
     if (order<Nstages-1) order++;
@@ -149,10 +150,10 @@ void mrsaab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   }
 }
 
-void mrsaab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
+void mrsaab3::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt, int order) {
 
-  occa::memory o_A = o_saab_a+order*Nstages*sizeof(dfloat);
-  occa::memory o_B = o_saab_b+order*Nstages*sizeof(dfloat);
+  deviceMemory<dfloat> o_A = o_saab_a+order*Nstages;
+  deviceMemory<dfloat> o_B = o_saab_b+order*Nstages;
 
   for (int Ntick=0; Ntick < (1 << (Nlevels-1));Ntick++) {
 
@@ -188,7 +189,7 @@ void mrsaab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
     //rotate index
     if (Nstages>2)
       for (int l=0; l<=lev; l++)
-        shiftIndex[l] = (shiftIndex[l]+Nstages-2)%(Nstages-1);
+        h_shiftIndex[l] = (h_shiftIndex[l]+Nstages-2)%(Nstages-1);
 
     //compute intermediate trace values on lev+1 / lev interface
     if (lev+1<Nlevels && mesh.mrInterfaceNelements[lev+1])
@@ -206,8 +207,8 @@ void mrsaab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
                         o_q,
                         o_fQM);
 
-    // o_shiftIndex.copyFrom(shiftIndex, "async: true");
-    o_shiftIndex.copyFrom(shiftIndex); //Required to keep the update kernel overlapping the transfer,
+    // o_shiftIndex.copyFrom(h_shiftIndex, properties_t("async", true));
+    h_shiftIndex.copyTo(o_shiftIndex); //Required to keep the update kernel overlapping the transfer,
                                        // but why does that happen?
   }
 }
@@ -241,12 +242,9 @@ void mrsaab3::UpdateCoefficients() {
                            5./8.,   -1./8.,    0.0,
                          17./24.,  -7./24., 2./24.};
 
-        memcpy(saab_x+n                +lev*Nfields,
-              _saab_X, 1*sizeof(dfloat));
-        memcpy(saab_a+n*Nstages*Nstages+lev*Nfields*Nstages*Nstages,
-              _saab_A,Nstages*Nstages*sizeof(dfloat));
-        memcpy(saab_b+n*Nstages*Nstages+lev*Nfields*Nstages*Nstages,
-              _saab_B,Nstages*Nstages*sizeof(dfloat));
+        saab_x.copyFrom(_saab_X,               1, n                +lev*Nfields);
+        saab_a.copyFrom(_saab_A, Nstages*Nstages, n*Nstages*Nstages+lev*Nfields*Nstages*Nstages);
+        saab_b.copyFrom(_saab_B, Nstages*Nstages, n*Nstages*Nstages+lev*Nfields*Nstages*Nstages);
 
       } else {
 
@@ -304,7 +302,7 @@ void mrsaab3::UpdateCoefficients() {
         dfloat bb32=real(b32)/ (double) Nr;
         dfloat bb33=real(b33)/ (double) Nr;
 
-        dfloat _saab_X[1]  = { exp(alpha) };
+        dfloat _saab_X[1]  = { std::exp(alpha) };
         dfloat _saab_A[Nstages*Nstages]
                         ={   aa11,   0.0,   0.0,
                              aa21,  aa22,   0.0,
@@ -314,12 +312,9 @@ void mrsaab3::UpdateCoefficients() {
                              bb21,  bb22,   0.0,
                              bb31,  bb32,  bb33 };
 
-        memcpy(saab_x+n                +lev*Nfields,
-              _saab_X, 1*sizeof(dfloat));
-        memcpy(saab_a+n*Nstages*Nstages+lev*Nfields*Nstages*Nstages,
-              _saab_A,Nstages*Nstages*sizeof(dfloat));
-        memcpy(saab_b+n*Nstages*Nstages+lev*Nfields*Nstages*Nstages,
-              _saab_B,Nstages*Nstages*sizeof(dfloat));
+        saab_x.copyFrom(_saab_X,               1, n                +lev*Nfields);
+        saab_a.copyFrom(_saab_A, Nstages*Nstages, n*Nstages*Nstages+lev*Nfields*Nstages*Nstages);
+        saab_b.copyFrom(_saab_B, Nstages*Nstages, n*Nstages*Nstages+lev*Nfields*Nstages*Nstages);
       }
     }
 
@@ -330,57 +325,40 @@ void mrsaab3::UpdateCoefficients() {
   }
 }
 
-mrsaab3::~mrsaab3() {
-  if (o_rhsq.size()) o_rhsq.free();
-  if (o_fQM.size()) o_fQM.free();
-
-  if (saab_x) free(saab_x);
-  if (saab_a) free(saab_a);
-  if (saab_b) free(saab_b);
-
-  if (o_saab_x.size()) o_saab_x.free();
-  if (o_saab_a.size()) o_saab_a.free();
-  if (o_saab_b.size()) o_saab_b.free();
-
-  updateKernel.free();
-  traceUpdateKernel.free();
-}
-
 /**************************************************/
 /* PML version                                    */
 /**************************************************/
 
 mrsaab3_pml::mrsaab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElements,
                          int Np, int _Nfields, int _Npmlfields,
-                         dfloat *_lambda, solver_t& _solver, mesh_t& _mesh):
-  mrsaab3(Nelements, NhaloElements, Np, _Nfields, _lambda, _solver, _mesh),
+                         memory<dfloat> _lambda,
+                         platform_t& _platform, mesh_t& _mesh):
+  mrsaab3(Nelements, NhaloElements,
+          Np, _Nfields, _lambda, _platform, _mesh),
   Npml(NpmlElements*Np*_Npmlfields),
   Npmlfields(_Npmlfields) {
 
   if (Npml) {
-    platform_t &platform = solver.platform;
+    memory<dfloat> pmlq(Npml, 0.0);
+    o_pmlq = platform.malloc<dfloat>(pmlq);
 
-    dfloat *pmlq = (dfloat*) calloc(Npml, sizeof(dfloat));
-    o_pmlq = platform.malloc(Npml*sizeof(dfloat), pmlq);
-    free(pmlq);
+    memory<dfloat> rhspmlq0(Npml, 0.0);
+    o_rhspmlq0 = platform.malloc<dfloat>(rhspmlq0);
 
-    dfloat *rhspmlq0 = (dfloat*) calloc(Npml, sizeof(dfloat));
-    o_rhspmlq0 = platform.malloc(Npml*sizeof(dfloat), rhspmlq0);
-    free(rhspmlq0);
+    memory<dfloat> rhspmlq((Nstages-1)*Npml, 0.0);
+    o_rhspmlq = platform.malloc<dfloat>(rhspmlq);
 
-    dfloat *rhspmlq = (dfloat*) calloc((Nstages-1)*Npml, sizeof(dfloat));
-    o_rhspmlq = platform.malloc((Nstages-1)*Npml*sizeof(dfloat), rhspmlq);
-    free(rhspmlq);
+    properties_t kernelInfo = platform.props(); //copy base occa properties from platform
 
-    occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+    const int blocksize=256;
 
-    kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+    kernelInfo["defines/" "p_blockSize"] = blocksize;
     kernelInfo["defines/" "p_Nstages"] = Nstages;
     kernelInfo["defines/" "p_Np"] = mesh.Np;
     kernelInfo["defines/" "p_Nfp"] = mesh.Nfp;
     kernelInfo["defines/" "p_Nfaces"] = mesh.Nfaces;
     kernelInfo["defines/" "p_Nfields"] = Nfields;
-    int maxNodes = mymax(mesh.Np, mesh.Nfp*mesh.Nfaces);
+    int maxNodes = std::max(mesh.Np, mesh.Nfp*mesh.Nfaces);
     kernelInfo["defines/" "p_maxNodes"] = maxNodes;
 
     pmlUpdateKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/"
@@ -398,23 +376,23 @@ mrsaab3_pml::mrsaab3_pml(dlong Nelements, dlong NpmlElements, dlong NhaloElement
                            5./8.,   -1./8.,    0.0,
                          17./24.,  -7./24., 2./24.};
 
-    pmlsaab_a = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat));
-    pmlsaab_b = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat));
-    memcpy(pmlsaab_a, _ab_a, Nstages*Nstages*sizeof(dfloat));
-    memcpy(pmlsaab_b, _ab_b, Nstages*Nstages*sizeof(dfloat));
+    pmlsaab_a.malloc(Nstages*Nstages);
+    pmlsaab_b.malloc(Nstages*Nstages);
+    pmlsaab_a.copyFrom(_ab_a);
+    pmlsaab_b.copyFrom(_ab_b);
 
-    o_pmlsaab_a = platform.malloc(Nstages*Nstages*sizeof(dfloat), pmlsaab_a);
-    o_pmlsaab_b = platform.malloc(Nstages*Nstages*sizeof(dfloat), pmlsaab_b);
+    o_pmlsaab_a = platform.malloc<dfloat>(pmlsaab_a);
+    o_pmlsaab_b = platform.malloc<dfloat>(pmlsaab_b);
   }
 }
 
-void mrsaab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
+void mrsaab3_pml::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt, int order) {
 
-  occa::memory o_A = o_saab_a+order*Nstages*sizeof(dfloat);
-  occa::memory o_B = o_saab_b+order*Nstages*sizeof(dfloat);
+  deviceMemory<dfloat> o_A = o_saab_a+order*Nstages;
+  deviceMemory<dfloat> o_B = o_saab_b+order*Nstages;
 
-  occa::memory o_pmlA;
-  if (Npml) o_pmlA = o_pmlsaab_a+order*Nstages*sizeof(dfloat);
+  deviceMemory<dfloat> o_pmlA;
+  if (Npml) o_pmlA = o_pmlsaab_a+order*Nstages;
 
   for (int Ntick=0; Ntick < (1 << (Nlevels-1));Ntick++) {
 
@@ -466,7 +444,7 @@ void mrsaab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
     //rotate index
     if (Nstages>2)
       for (int l=0; l<=lev; l++)
-        shiftIndex[l] = (shiftIndex[l]+Nstages-2)%(Nstages-1);
+        h_shiftIndex[l] = (h_shiftIndex[l]+Nstages-2)%(Nstages-1);
 
     //compute intermediate trace values on lev+1 / lev interface
     if (lev+1<Nlevels && mesh.mrInterfaceNelements[lev+1])
@@ -484,18 +462,12 @@ void mrsaab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
                         o_q,
                         o_fQM);
 
-    // o_shiftIndex.copyFrom(shiftIndex, "async: true");
-    o_shiftIndex.copyFrom(shiftIndex); //Required to keep the update kernel overlapping the transfer,
+    // o_shiftIndex.copyFrom(h_shiftIndex, properties_t("async", true));
+    h_shiftIndex.copyTo(o_shiftIndex); //Required to keep the update kernel overlapping the transfer,
                                        // but why does that happen?
   }
 }
 
-mrsaab3_pml::~mrsaab3_pml() {
-  if (o_pmlq.size()) o_pmlq.free();
-  if (o_rhspmlq0.size()) o_rhspmlq0.free();
-  if (o_rhspmlq.size()) o_rhspmlq.free();
-
-  pmlUpdateKernel.free();
-}
+} //namespace TimeStepper
 
-} //namespace TimeStepper
\ No newline at end of file
+} //namespace libp
diff --git a/libs/timeStepper/timeStepperSAAB3.cpp b/libs/timeStepper/timeStepperSAAB3.cpp
index c77cb08ea..a0cf156c9 100644
--- a/libs/timeStepper/timeStepperSAAB3.cpp
+++ b/libs/timeStepper/timeStepperSAAB3.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,32 +28,36 @@ SOFTWARE.
 #include "timeStepper.hpp"
 #include <complex>
 
+namespace libp {
+
 namespace TimeStepper {
 
 using std::complex;
 
 saab3::saab3(dlong _Nelements, dlong _NhaloElements,
              int _Np, int _Nfields,
-             dfloat *_lambda, solver_t& _solver):
-  timeStepper_t(_Nelements, _NhaloElements, _Np, _Nfields, _solver),
+             memory<dfloat> _lambda,
+             platform_t& _platform, comm_t _comm):
+  timeStepperBase_t(_Nelements, _NhaloElements,
+                    _Np, _Nfields, _platform, _comm),
   Np(_Np),
   Nfields(_Nfields),
   Nelements(_Nelements),
   NhaloElements(_NhaloElements) {
 
-  platform_t &platform = solver.platform;
-
-  lambda = (dfloat *) malloc(Nfields*sizeof(dfloat));
-  memcpy(lambda, _lambda, Nfields*sizeof(dfloat));
+  lambda.malloc(Nfields);
+  lambda.copyFrom(_lambda);
 
   Nstages = 3;
   shiftIndex = 0;
 
-  o_rhsq = platform.malloc(Nstages*N*sizeof(dfloat));
+  o_rhsq = platform.malloc<dfloat>(Nstages*N);
+
+  const int blocksize=256;
 
-  occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+  properties_t kernelInfo = platform.props(); //copy base occa properties from platform
 
-  kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+  kernelInfo["defines/" "p_blockSize"] = blocksize;
   kernelInfo["defines/" "p_Nstages"] = Nstages;
   kernelInfo["defines/" "p_Np"]      = (int)Np;
   kernelInfo["defines/" "p_Nfields"] = (int)Nfields;
@@ -63,20 +67,20 @@ saab3::saab3(dlong _Nelements, dlong _NhaloElements,
                                     "saabUpdate",
                                     kernelInfo);
 
-  saab_x = (dfloat*) malloc(Nfields*sizeof(dfloat));
-  o_saab_x = platform.malloc(Nfields*sizeof(dfloat));
+  h_saab_x = platform.hostMalloc<dfloat>(Nfields);
+  o_saab_x = platform.malloc<dfloat>(Nfields);
 
-  saab_a = (dfloat*) malloc(Nfields*Nstages*Nstages*sizeof(dfloat));
-  o_saab_a =  platform.malloc(Nfields*Nstages*Nstages*sizeof(dfloat));
+  h_saab_a = platform.hostMalloc<dfloat>(Nfields*Nstages*Nstages);
+  o_saab_a = platform.malloc<dfloat>(Nfields*Nstages*Nstages);
 }
 
-void saab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
+void saab3::Run(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat start, dfloat end) {
 
   dfloat time = start;
 
   solver.Report(time,0);
 
-  dfloat outputInterval;
+  dfloat outputInterval=0.0;
   solver.settings.getSetting("OUTPUT INTERVAL", outputInterval);
 
   dfloat outputTime = time + outputInterval;
@@ -87,7 +91,7 @@ void saab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   int tstep=0;
   int order=0;
   while (time < end) {
-    Step(o_q, time, dt, order);
+    Step(solver, o_q, time, dt, order);
     time += dt;
     tstep++;
     if (order<Nstages-1) order++;
@@ -100,14 +104,14 @@ void saab3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   }
 }
 
-void saab3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
+void saab3::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt, int order) {
 
   //rhs at current index
-  occa::memory o_rhsq0 = o_rhsq + shiftIndex*N*sizeof(dfloat);
+  deviceMemory<dfloat> o_rhsq0 = o_rhsq + shiftIndex*N;
 
   //coefficients at current order
-  occa::memory o_X = o_saab_x;
-  occa::memory o_A = o_saab_a + order*Nstages*sizeof(dfloat);
+  deviceMemory<dfloat> o_X = o_saab_x;
+  deviceMemory<dfloat> o_A = o_saab_a + order*Nstages;
 
   //evaluate ODE rhs = f(q,t)
   solver.rhsf(o_q, o_rhsq0, time);
@@ -147,8 +151,8 @@ void saab3::UpdateCoefficients() {
                          3./2.,   -1./2.,    0.0,
                        23./12., -16./12., 5./12.};
 
-      memcpy(saab_x+n                ,_saab_X,    1*sizeof(dfloat));
-      memcpy(saab_a+n*Nstages*Nstages,_saab_A,Nstages*Nstages*sizeof(dfloat));
+      h_saab_x.copyFrom(_saab_X,               1, n                );
+      h_saab_a.copyFrom(_saab_A, Nstages*Nstages, n*Nstages*Nstages);
 
     } else {
 
@@ -186,30 +190,22 @@ void saab3::UpdateCoefficients() {
       dfloat aa32=real(a32)/ (double) Nr;
       dfloat aa33=real(a33)/ (double) Nr;
 
-      dfloat _saab_X[1]  = { exp(alpha) };
+      dfloat _saab_X[1]  = { std::exp(alpha) };
       dfloat _saab_A[Nstages*Nstages]
                       ={   aa11,   0.0,   0.0,
                            aa21,  aa22,   0.0,
                            aa31,  aa32,  aa33 };
 
-      memcpy(saab_x+n                ,_saab_X,    1*sizeof(dfloat));
-      memcpy(saab_a+n*Nstages*Nstages,_saab_A,Nstages*Nstages*sizeof(dfloat));
+      h_saab_x.copyFrom(_saab_X,               1, n                );
+      h_saab_a.copyFrom(_saab_A, Nstages*Nstages, n*Nstages*Nstages);
     }
 
     // move data to platform
-    o_saab_x.copyFrom(saab_x);
-    o_saab_a.copyFrom(saab_a);
+    h_saab_x.copyTo(o_saab_x);
+    h_saab_a.copyTo(o_saab_a);
   }
 }
 
-saab3::~saab3() {
-  if (o_rhsq.size()) o_rhsq.free();
-  if (o_saab_x.size()) o_saab_x.free();
-  if (o_saab_a.size()) o_saab_a.free();
-
-  updateKernel.free();
-}
-
 
 /**************************************************/
 /* PML version                                    */
@@ -217,22 +213,22 @@ saab3::~saab3() {
 
 saab3_pml::saab3_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements,
                      int _Np, int _Nfields, int Npmlfields,
-                     dfloat *_lambda, solver_t& _solver):
-  saab3(_Nelements, _NhaloElements, _Np, _Nfields, _lambda, _solver),
+                     memory<dfloat> _lambda,
+                     platform_t& _platform, comm_t _comm):
+  saab3(_Nelements, _NhaloElements, _Np, _Nfields, _lambda, _platform, _comm),
   Npml(Npmlfields*_Np*_NpmlElements) {
 
   if (Npml) {
-    platform_t &platform = solver.platform;
+    memory<dfloat> pmlq(Npml,0.0);
+    o_pmlq   = platform.malloc<dfloat>(pmlq);
 
-    dfloat *pmlq = (dfloat *) calloc(Npml,sizeof(dfloat));
-    o_pmlq   = platform.malloc(Npml*sizeof(dfloat), pmlq);
-    free(pmlq);
+    o_rhspmlq = platform.malloc<dfloat>(Nstages*Npml);
 
-    o_rhspmlq = platform.malloc(Nstages*Npml*sizeof(dfloat));
+    properties_t kernelInfo = platform.props(); //copy base occa properties from platform
 
-    occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+    const int blocksize=256;
 
-    kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+    kernelInfo["defines/" "p_blockSize"] = blocksize;
     kernelInfo["defines/" "p_Nstages"] = Nstages;
     kernelInfo["defines/" "p_Np"]      = (int)Np;
     kernelInfo["defines/" "p_Nfields"] = (int)Nfields;
@@ -248,29 +244,29 @@ saab3_pml::saab3_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements
                            3./2.,   -1./2.,    0.0,
                          23./12., -16./12., 5./12.};
 
-    pmlsaab_a = (dfloat*) calloc(Nstages*Nstages, sizeof(dfloat));
-    memcpy(pmlsaab_a, _ab_a, Nstages*Nstages*sizeof(dfloat));
+    pmlsaab_a.malloc(Nstages*Nstages);
+    pmlsaab_a.copyFrom(_ab_a);
 
-    o_pmlsaab_a = platform.malloc(Nstages*Nstages*sizeof(dfloat), pmlsaab_a);
+    o_pmlsaab_a = platform.malloc<dfloat>(pmlsaab_a);
   }
 }
 
 
-void saab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
+void saab3_pml::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt, int order) {
 
   //rhs at current index
-  occa::memory o_rhsq0    = o_rhsq + shiftIndex*N*sizeof(dfloat);
-  occa::memory o_rhspmlq0;
+  deviceMemory<dfloat> o_rhsq0    = o_rhsq + shiftIndex*N;
+  deviceMemory<dfloat> o_rhspmlq0;
 
 
   //coefficients at current order
-  occa::memory o_X = o_saab_x;
-  occa::memory o_A = o_saab_a + order*Nstages*sizeof(dfloat);
-  occa::memory o_pmlA;
+  deviceMemory<dfloat> o_X = o_saab_x;
+  deviceMemory<dfloat> o_A = o_saab_a + order*Nstages;
+  deviceMemory<dfloat> o_pmlA;
 
   if (Npml) {
-    o_rhspmlq0 = o_rhspmlq + shiftIndex*Npml*sizeof(dfloat);
-    o_pmlA = o_pmlsaab_a + order*Nstages*sizeof(dfloat);
+    o_rhspmlq0 = o_rhspmlq + shiftIndex*Npml;
+    o_pmlA = o_pmlsaab_a + order*Nstages;
   }
 
   //evaluate ODE rhs = f(q,t)
@@ -297,12 +293,6 @@ void saab3_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
   shiftIndex = (shiftIndex+Nstages-1)%Nstages;
 }
 
-saab3_pml::~saab3_pml() {
-  if (o_pmlq.size()) o_pmlq.free();
-  if (o_rhspmlq.size()) o_rhspmlq.free();
-  if (o_pmlsaab_a.size()) o_pmlsaab_a.free();
-
-  pmlUpdateKernel.free();
-}
-
 } //namespace TimeStepper
+
+} //namespace libp
diff --git a/libs/timeStepper/timeStepperSARK4.cpp b/libs/timeStepper/timeStepperSARK4.cpp
index 506db1b54..b3749b958 100644
--- a/libs/timeStepper/timeStepperSARK4.cpp
+++ b/libs/timeStepper/timeStepperSARK4.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,24 +29,25 @@ SOFTWARE.
 #include "timeStepper.hpp"
 #include <complex>
 
+namespace libp {
+
 namespace TimeStepper {
 
 using std::complex;
 
 sark4::sark4(dlong _Nelements, dlong _NhaloElements,
              int _Np, int _Nfields,
-             dfloat *_lambda, solver_t& _solver, MPI_Comm _comm):
-  timeStepper_t(_Nelements, _NhaloElements, _Np, _Nfields, _solver),
-  comm(_comm),
+             memory<dfloat> _lambda,
+             platform_t& _platform, comm_t _comm):
+  timeStepperBase_t(_Nelements, _NhaloElements, _Np, _Nfields,
+                    _platform, _comm),
   Np(_Np),
   Nfields(_Nfields),
   Nelements(_Nelements),
   NhaloElements(_NhaloElements) {
 
-  platform_t &platform = solver.platform;
-
-  lambda = (dfloat *) malloc(Nfields*sizeof(dfloat));
-  memcpy(lambda, _lambda, Nfields*sizeof(dfloat));
+  lambda.malloc(Nfields);
+  lambda.copyFrom(_lambda);
 
   Nrk = 5;
   order = 4;
@@ -55,26 +56,27 @@ sark4::sark4(dlong _Nelements, dlong _NhaloElements,
   dlong Nlocal = Nelements*Np*Nfields;
   dlong Ntotal = (Nelements+NhaloElements)*Np*Nfields;
 
-  o_rkq    = platform.malloc(Ntotal*sizeof(dfloat));
-  o_rhsq   = platform.malloc(Nlocal*sizeof(dfloat));
-  o_rkrhsq = platform.malloc(Nlocal*Nrk*sizeof(dfloat));
-  o_rkerr  = platform.malloc(Nlocal*sizeof(dfloat));
+  o_rkq    = platform.malloc<dfloat>(Ntotal);
+  o_rhsq   = platform.malloc<dfloat>(Nlocal);
+  o_rkrhsq = platform.malloc<dfloat>(Nlocal*Nrk);
+  o_rkerr  = platform.malloc<dfloat>(Nlocal);
 
-  o_saveq  = platform.malloc(Nlocal*sizeof(dfloat));
+  o_saveq  = platform.malloc<dfloat>(Nlocal);
 
-  Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
-  errtmp = (dfloat*) calloc(Nblock, sizeof(dfloat));
-  o_errtmp = platform.malloc(Nblock*sizeof(dfloat));
+  const int blocksize=256;
 
-  hlong gNlocal = Nlocal;
-  hlong gNtotal;
-  MPI_Allreduce(&gNlocal, &gNtotal, 1, MPI_HLONG, MPI_SUM, comm);
+  Nblock = (N+blocksize-1)/blocksize;
+  h_errtmp = platform.hostMalloc<dfloat>(Nblock);
+  o_errtmp = platform.malloc<dfloat>(Nblock);
+
+  hlong gNtotal = Nlocal;
+  comm.Allreduce(gNtotal);
 
   //copy base occa properties from platform
-  occa::properties kernelInfo = solver.platform.props;
+  properties_t kernelInfo = platform.props();
 
   //add defines
-  kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE;
+  kernelInfo["defines/" "p_blockSize"] = (int)blocksize;
   kernelInfo["defines/" "p_Nrk"]     = (int)Nrk;
   kernelInfo["defines/" "p_Np"]      = (int)Np;
   kernelInfo["defines/" "p_Nfields"] = (int)Nfields;
@@ -96,16 +98,16 @@ sark4::sark4(dlong _Nelements, dlong _NhaloElements,
 
   // Semi-Analytic Runge Kutta - order (3) 4 with PID timestep control
   dfloat _rkC[Nrk] = {0.0, 0.5, 0.5, 1.0, 1.0};
-  rkC = (dfloat*) calloc(Nrk, sizeof(dfloat));
-  memcpy(rkC, _rkC, Nrk*sizeof(dfloat));
+  rkC.malloc(Nrk);
+  rkC.copyFrom(_rkC);
 
-  rkX = (dfloat*) platform.hostMalloc(Nfields*Nrk*    sizeof(dfloat), NULL, h_rkX);
-  rkA = (dfloat*) platform.hostMalloc(Nfields*Nrk*Nrk*sizeof(dfloat), NULL, h_rkA);
-  rkE = (dfloat*) platform.hostMalloc(Nfields*Nrk*    sizeof(dfloat), NULL, h_rkE);
+  h_rkX = platform.hostMalloc<dfloat>(Nfields*Nrk);
+  h_rkA = platform.hostMalloc<dfloat>(Nfields*Nrk*Nrk);
+  h_rkE = platform.hostMalloc<dfloat>(Nfields*Nrk);
 
-  o_rkX = platform.malloc(Nfields*Nrk*    sizeof(dfloat));
-  o_rkA = platform.malloc(Nfields*Nrk*Nrk*sizeof(dfloat));
-  o_rkE = platform.malloc(Nfields*Nrk*    sizeof(dfloat));
+  o_rkX = platform.malloc<dfloat>(Nfields*Nrk);
+  o_rkA = platform.malloc<dfloat>(Nfields*Nrk*Nrk);
+  o_rkE = platform.malloc<dfloat>(Nfields*Nrk);
 
   dtMIN = 1E-9; //minumum allowed timestep
   ATOL = 1E-5;  //absolute error tolerance
@@ -124,16 +126,15 @@ sark4::sark4(dlong _Nelements, dlong _NhaloElements,
   sqrtinvNtotal = 1.0/sqrt(gNtotal);
 }
 
-void sark4::Run(occa::memory &o_q, dfloat start, dfloat end) {
+void sark4::Run(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat start, dfloat end) {
 
   dfloat time = start;
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
+  int rank = comm.rank();
 
   solver.Report(time,0);
 
-  dfloat outputInterval;
+  dfloat outputInterval=0.0;
   solver.settings.getSetting("OUTPUT INTERVAL", outputInterval);
 
   dfloat outputTime = time + outputInterval;
@@ -145,23 +146,17 @@ void sark4::Run(occa::memory &o_q, dfloat start, dfloat end) {
 
   while (time < end) {
 
-    if (dt<dtMIN){
-      stringstream ss;
-      ss << "Time step became too small at time step = " << tstep;
-      LIBP_ABORT(ss.str());
-    }
-    if (std::isnan(dt)) {
-      stringstream ss;
-      ss << "Solution became unstable at time step = " << tstep;
-      LIBP_ABORT(ss.str());
-    }
+    LIBP_ABORT("Time step became too small at time step = " << tstep,
+               dt<dtMIN);
+    LIBP_ABORT("Solution became unstable at time step = " << tstep,
+               std::isnan(dt));
 
     //check for final timestep
     if (time+dt > end){
       dt = end-time;
     }
 
-    Step(o_q, time, dt);
+    Step(solver, o_q, time, dt);
 
     // compute Dopri estimator
     dfloat err = Estimater(o_q);
@@ -170,7 +165,7 @@ void sark4::Run(occa::memory &o_q, dfloat start, dfloat end) {
     dfloat fac1 = pow(err,exp1);
     dfloat fac = fac1/pow(facold,beta);
 
-    fac = mymax(invfactor2, mymin(invfactor1,fac/safe));
+    fac = std::max(invfactor2, std::min(invfactor1,fac/safe));
     dfloat dtnew = dt/fac;
 
     if (err<1.0) { //dt is accepted
@@ -192,7 +187,7 @@ void sark4::Run(occa::memory &o_q, dfloat start, dfloat end) {
         UpdateCoefficients();
 
         // time step to output
-        Step(o_q, time, dt);
+        Step(solver, o_q, time, dt);
 
         // shift for output
         o_rkq.copyTo(o_q);
@@ -219,14 +214,15 @@ void sark4::Run(occa::memory &o_q, dfloat start, dfloat end) {
       time += dt;
       while (time>outputTime) outputTime+= outputInterval; //catch up next output in case dt>outputInterval
 
-      facold = mymax(err,1E-4); // hard coded factor ?
+      constexpr dfloat errMax = 1.0e-4;  // lower bound applied to facold (hard coded factor ?)
+      facold = std::max(err,errMax);
 
       // if (!rank)
       //   printf("\r time = %g (%d), dt = %g accepted                      ", time, allStep,  dt);
 
       tstep++;
     } else {
-      dtnew = dt/(mymax(invfactor1,fac1/safe));
+      dtnew = dt/(std::max(invfactor1,fac1/safe));
 
       // if (!rank)
       //   printf("\r time = %g (%d), dt = %g rejected, trying %g", time, allStep, dt, dtnew);
@@ -245,19 +241,19 @@ void sark4::Run(occa::memory &o_q, dfloat start, dfloat end) {
     printf("%d accepted steps and %d total steps\n", tstep, allStep);
 }
 
-void sark4::Backup(occa::memory &o_Q) {
-  o_saveq.copyFrom(o_Q, N*sizeof(dfloat));
+void sark4::Backup(deviceMemory<dfloat> &o_Q) {
+  o_saveq.copyFrom(o_Q, N);
 }
 
-void sark4::Restore(occa::memory &o_Q) {
-  o_saveq.copyTo(o_Q, N*sizeof(dfloat));
+void sark4::Restore(deviceMemory<dfloat> &o_Q) {
+  o_saveq.copyTo(o_Q, N);
 }
 
-void sark4::AcceptStep(occa::memory &o_q, occa::memory &o_rq) {
-  o_q.copyFrom(o_rq, N*sizeof(dfloat));
+void sark4::AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq) {
+  o_q.copyFrom(o_rq, N);
 }
 
-void sark4::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
+void sark4::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt) {
 
   //RK step
   for(int rk=0;rk<Nrk;++rk){
@@ -298,7 +294,7 @@ void sark4::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
   }
 }
 
-dfloat sark4::Estimater(occa::memory& o_q){
+dfloat sark4::Estimater(deviceMemory<dfloat>& o_q){
 
   //Error estimation
   //E. HAIRER, S.P. NORSETT AND G. WANNER, SOLVING ORDINARY
@@ -311,13 +307,12 @@ dfloat sark4::Estimater(occa::memory& o_q){
                         o_rkerr,
                         o_errtmp);
 
-  o_errtmp.copyTo(errtmp);
-  dfloat localerr = 0;
+  h_errtmp.copyFrom(o_errtmp);
   dfloat err = 0;
   for(dlong n=0;n<Nblock;++n){
-    localerr += errtmp[n];
+    err += h_errtmp[n];
   }
-  MPI_Allreduce(&localerr, &err, 1, MPI_DFLOAT, MPI_SUM, comm);
+  comm.Allreduce(err);
 
   err = sqrt(err)*sqrtinvNtotal;
 
@@ -349,9 +344,9 @@ void sark4::UpdateCoefficients() {
                          1.0/6.0,  1.0/3.0,   1.0/3.0,   1.0/6.0,      0.0};
       dfloat _rkE[Nrk]= {    0.0,      0.0,       0.0,  -1.0/6.0,  1.0/6.0};
 
-      memcpy(rkX+n*Nrk    ,_rkX,    Nrk*sizeof(dfloat));
-      memcpy(rkA+n*Nrk*Nrk,_rkA,Nrk*Nrk*sizeof(dfloat));
-      memcpy(rkE+n*Nrk    ,_rkE,    Nrk*sizeof(dfloat));
+      h_rkX.copyFrom(_rkX,    Nrk,n*Nrk    );
+      h_rkA.copyFrom(_rkA,Nrk*Nrk,n*Nrk*Nrk);
+      h_rkE.copyFrom(_rkE,    Nrk,n*Nrk    );
 
     } else {
 
@@ -401,7 +396,11 @@ void sark4::UpdateCoefficients() {
       dfloat a54=real(ca54)/ (double) Nr;
 
       // first set non-semianalytic part of the integrator
-      dfloat _rkX[Nrk]  = {1.0, exp(0.5*alpha), exp(0.5*alpha), exp(alpha), exp(alpha) };
+      dfloat _rkX[Nrk]  = {1.0,
+                           std::exp(dfloat(0.5)*alpha),
+                           std::exp(dfloat(0.5)*alpha),
+                           std::exp(alpha),
+                           std::exp(alpha) };
       dfloat _rkA[Nrk*Nrk]
                       ={   0.0,  0.0,  0.0,   0.0,  0.0,
                            a21,  0.0,  0.0,   0.0,  0.0,
@@ -410,42 +409,21 @@ void sark4::UpdateCoefficients() {
                            a51,  a52,  a53,   a54,  0.0};
       dfloat _rkE[Nrk]= {  0.0,  0.0,  0.0,  -a54,  a54};
 
-      memcpy(rkX+n*Nrk    ,_rkX,    Nrk*sizeof(dfloat));
-      memcpy(rkA+n*Nrk*Nrk,_rkA,Nrk*Nrk*sizeof(dfloat));
-      memcpy(rkE+n*Nrk    ,_rkE,    Nrk*sizeof(dfloat));
+      h_rkX.copyFrom(_rkX,    Nrk,n*Nrk    );
+      h_rkA.copyFrom(_rkA,Nrk*Nrk,n*Nrk*Nrk);
+      h_rkE.copyFrom(_rkE,    Nrk,n*Nrk    );
     }
 
     // move data to platform
-    // o_rkX.copyFrom(rkX, "async: true");
-    // o_rkA.copyFrom(rkA, "async: true");
-    // o_rkE.copyFrom(rkE, "async: true");
-    o_rkX.copyFrom(rkX);
-    o_rkA.copyFrom(rkA);
-    o_rkE.copyFrom(rkE);
+    // o_rkX.copyFrom(h_rkX, properties_t("async", true));
+    // o_rkA.copyFrom(h_rkA, properties_t("async", true));
+    // o_rkE.copyFrom(h_rkE, properties_t("async", true));
+    h_rkX.copyTo(o_rkX);
+    h_rkA.copyTo(o_rkA);
+    h_rkE.copyTo(o_rkE);
   }
 }
 
-sark4::~sark4() {
-  if (o_rkq.size()) o_rkq.free();
-  if (o_rkrhsq.size()) o_rkrhsq.free();
-  if (o_rkerr.size()) o_rkerr.free();
-  if (o_errtmp.size()) o_errtmp.free();
-  if (o_rkX.size()) o_rkX.free();
-  if (o_rkA.size()) o_rkA.free();
-  if (o_rkE.size()) o_rkE.free();
-
-  if (errtmp) free(errtmp);
-  if (lambda) free(lambda);
-  if (rkC) free(rkC);
-
-  if (h_rkX.size()) h_rkX.free();
-  if (h_rkA.size()) h_rkA.free();
-  if (h_rkE.size()) h_rkE.free();
-
-  rkUpdateKernel.free();
-  rkStageKernel.free();
-  rkErrorEstimateKernel.free();
-}
 
 /**************************************************/
 /* PML version                                    */
@@ -453,28 +431,28 @@ sark4::~sark4() {
 
 sark4_pml::sark4_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements,
             int _Np, int _Nfields, int _Npmlfields,
-            dfloat *_lambda, solver_t& _solver, MPI_Comm _comm):
-  sark4(_Nelements, _NhaloElements, _Np, _Nfields, _lambda, _solver, _comm),
+            memory<dfloat> _lambda,
+            platform_t& _platform, comm_t _comm):
+  sark4(_Nelements, _NhaloElements, _Np, _Nfields, _lambda, _platform, _comm),
   Npml(_Npmlfields*_Np*_NpmlElements) {
 
   if (Npml) {
-    platform_t &platform = solver.platform;
-
-    dfloat *pmlq = (dfloat *) calloc(Npml,sizeof(dfloat));
-    o_pmlq   = platform.malloc(Npml*sizeof(dfloat), pmlq);
-    free(pmlq);
+    memory<dfloat> pmlq(Npml,0.0);
+    o_pmlq = platform.malloc<dfloat>(pmlq);
 
-    o_rkpmlq    = platform.malloc(Npml*sizeof(dfloat));
-    o_rhspmlq   = platform.malloc(Npml*sizeof(dfloat));
-    o_rkrhspmlq = platform.malloc(Npml*Nrk*sizeof(dfloat));
+    o_rkpmlq    = platform.malloc<dfloat>(Npml);
+    o_rhspmlq   = platform.malloc<dfloat>(Npml);
+    o_rkrhspmlq = platform.malloc<dfloat>(Npml*Nrk);
 
-    o_savepmlq   = platform.malloc(Npml*sizeof(dfloat));
+    o_savepmlq   = platform.malloc<dfloat>(Npml);
 
     //copy base occa properties from solver
-    occa::properties kernelInfo = platform.props;
+    properties_t kernelInfo = platform.props();
+
+    const int blocksize=256;
 
     //add defines
-    kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE;
+    kernelInfo["defines/" "p_blockSize"] = (int)blocksize;
     kernelInfo["defines/" "p_Nrk"]     = (int)Nrk;
     kernelInfo["defines/" "p_Np"]      = (int)Np;
     kernelInfo["defines/" "p_Nfields"] = (int)Nfields;
@@ -490,7 +468,7 @@ sark4_pml::sark4_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements
                                       kernelInfo);
 
     // Semi-Analytic Runge Kutta - order (3) 4 with PID timestep control
-    pmlrkA = (dfloat*) malloc(Nrk*Nrk*sizeof(dfloat));
+    pmlrkA.malloc(Nrk*Nrk);
 
     dfloat _pmlrkA[Nrk*Nrk]
                     = {      0.0,      0.0,       0.0,      0.0,       0.0,
@@ -498,31 +476,31 @@ sark4_pml::sark4_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements
                              0.0,      0.5,       0.0,      0.0,       0.0,
                              0.0,      0.0,       1.0,      0.0,       0.0,
                          1.0/6.0,  1.0/3.0,   1.0/3.0,   1.0/6.0,      0.0};
-    memcpy(pmlrkA, _pmlrkA, Nrk*Nrk*sizeof(dfloat));
+    pmlrkA.copyFrom(_pmlrkA);
 
-    o_pmlrkA = platform.malloc(Nrk*Nrk*sizeof(dfloat), pmlrkA);
+    o_pmlrkA = platform.malloc<dfloat>(pmlrkA);
   }
 }
 
-void sark4_pml::Backup(occa::memory &o_Q) {
-  o_saveq.copyFrom(o_Q, N*sizeof(dfloat));
+void sark4_pml::Backup(deviceMemory<dfloat> &o_Q) {
+  o_saveq.copyFrom(o_Q, N);
   if (Npml)
-    o_savepmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat));
+    o_savepmlq.copyFrom(o_rkpmlq, Npml);
 }
 
-void sark4_pml::Restore(occa::memory &o_Q) {
-  o_saveq.copyTo(o_Q, N*sizeof(dfloat));
+void sark4_pml::Restore(deviceMemory<dfloat> &o_Q) {
+  o_saveq.copyTo(o_Q, N);
   if (Npml)
-    o_savepmlq.copyTo(o_rkpmlq, Npml*sizeof(dfloat));
+    o_savepmlq.copyTo(o_rkpmlq, Npml);
 }
 
-void sark4_pml::AcceptStep(occa::memory &o_q, occa::memory &o_rq) {
-  o_q.copyFrom(o_rq, N*sizeof(dfloat));
+void sark4_pml::AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq) {
+  o_q.copyFrom(o_rq, N);
   if (Npml)
-    o_pmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat));
+    o_pmlq.copyFrom(o_rkpmlq, Npml);
 }
 
-void sark4_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
+void sark4_pml::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt) {
 
   //RK step
   for(int rk=0;rk<Nrk;++rk){
@@ -581,17 +559,6 @@ void sark4_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
   }
 }
 
-sark4_pml::~sark4_pml() {
-  if (o_pmlq.size()) o_pmlq.free();
-  if (o_rkpmlq.size()) o_rkpmlq.free();
-  if (o_rhspmlq.size()) o_rhspmlq.free();
-  if (o_rkrhspmlq.size()) o_rkrhspmlq.free();
-  if (o_pmlrkA.size()) o_pmlrkA.free();
-
-  if (o_savepmlq.size()) o_savepmlq.free();
-
-  rkPmlUpdateKernel.free();
-  rkPmlStageKernel.free();
-}
-
 } //namespace TimeStepper
+
+} //namespace libp
diff --git a/libs/timeStepper/timeStepperSARK5.cpp b/libs/timeStepper/timeStepperSARK5.cpp
index d3173a576..c012c09a2 100644
--- a/libs/timeStepper/timeStepperSARK5.cpp
+++ b/libs/timeStepper/timeStepperSARK5.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,24 +29,25 @@ SOFTWARE.
 #include "timeStepper.hpp"
 #include <complex>
 
+namespace libp {
+
 namespace TimeStepper {
 
 using std::complex;
 
 sark5::sark5(dlong _Nelements, dlong _NhaloElements,
              int _Np, int _Nfields,
-             dfloat *_lambda, solver_t& _solver, MPI_Comm _comm):
-  timeStepper_t(_Nelements, _NhaloElements, _Np, _Nfields, _solver),
-  comm(_comm),
+             memory<dfloat> _lambda,
+             platform_t& _platform, comm_t _comm):
+  timeStepperBase_t(_Nelements, _NhaloElements, _Np, _Nfields,
+                    _platform, _comm),
   Np(_Np),
   Nfields(_Nfields),
   Nelements(_Nelements),
   NhaloElements(_NhaloElements) {
 
-  platform_t &platform = solver.platform;
-
-  lambda = (dfloat *) malloc(Nfields*sizeof(dfloat));
-  memcpy(lambda, _lambda, Nfields*sizeof(dfloat));
+  lambda.malloc(Nfields);
+  lambda.copyFrom(_lambda);
 
   Nrk = 7; //number of stages
   order = 5;
@@ -55,25 +56,26 @@ sark5::sark5(dlong _Nelements, dlong _NhaloElements,
   dlong Nlocal = Nelements*Np*Nfields;
   dlong Ntotal = (Nelements+NhaloElements)*Np*Nfields;
 
-  o_rkq    = platform.malloc(Ntotal*sizeof(dfloat));
-  o_rhsq   = platform.malloc(Nlocal*sizeof(dfloat));
-  o_rkrhsq = platform.malloc(Nlocal*Nrk*sizeof(dfloat));
-  o_rkerr  = platform.malloc(Nlocal*sizeof(dfloat));
+  o_rkq    = platform.malloc<dfloat>(Ntotal);
+  o_rhsq   = platform.malloc<dfloat>(Nlocal);
+  o_rkrhsq = platform.malloc<dfloat>(Nlocal*Nrk);
+  o_rkerr  = platform.malloc<dfloat>(Nlocal);
+
+  o_saveq  = platform.malloc<dfloat>(Nlocal);
 
-  o_saveq  = platform.malloc(Nlocal*sizeof(dfloat));
+  const int blocksize=256;
 
-  Nblock = (N+BLOCKSIZE-1)/BLOCKSIZE;
-  errtmp = (dfloat*) calloc(Nblock, sizeof(dfloat));
-  o_errtmp = platform.malloc(Nblock*sizeof(dfloat));
+  Nblock = (N+blocksize-1)/blocksize;
+  h_errtmp = platform.hostMalloc<dfloat>(Nblock);
+  o_errtmp = platform.malloc<dfloat>(Nblock);
 
-  hlong gNlocal = Nlocal;
-  hlong gNtotal;
-  MPI_Allreduce(&gNlocal, &gNtotal, 1, MPI_HLONG, MPI_SUM, comm);
+  hlong gNtotal = Nlocal;
+  comm.Allreduce(gNtotal);
 
-  occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+  properties_t kernelInfo = platform.props(); //copy base occa properties from platform
 
   //add defines
-  kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE;
+  kernelInfo["defines/" "p_blockSize"] = (int)blocksize;
   kernelInfo["defines/" "p_Nrk"]     = (int)Nrk;
   kernelInfo["defines/" "p_Np"]      = (int)Np;
   kernelInfo["defines/" "p_Nfields"] = (int)Nfields;
@@ -95,16 +97,16 @@ sark5::sark5(dlong _Nelements, dlong _NhaloElements,
 
   // Semi-Analytic Runge Kutta - order (4) 5 with PID timestep control
   dfloat _rkC[Nrk] = {0.0, 0.25, 0.25, 0.5, 0.75, 1.0, 1.0};
-  rkC = (dfloat*) calloc(Nrk, sizeof(dfloat));
-  memcpy(rkC, _rkC, Nrk*sizeof(dfloat));
+  rkC.malloc(Nrk);
+  rkC.copyFrom(_rkC);
 
-  rkX = (dfloat*) platform.hostMalloc(Nfields*Nrk*    sizeof(dfloat), NULL, h_rkX);
-  rkA = (dfloat*) platform.hostMalloc(Nfields*Nrk*Nrk*sizeof(dfloat), NULL, h_rkA);
-  rkE = (dfloat*) platform.hostMalloc(Nfields*Nrk*    sizeof(dfloat), NULL, h_rkE);
+  h_rkX = platform.hostMalloc<dfloat>(Nfields*Nrk);
+  h_rkA = platform.hostMalloc<dfloat>(Nfields*Nrk*Nrk);
+  h_rkE = platform.hostMalloc<dfloat>(Nfields*Nrk);
 
-  o_rkX = platform.malloc(Nfields*Nrk*    sizeof(dfloat));
-  o_rkA = platform.malloc(Nfields*Nrk*Nrk*sizeof(dfloat));
-  o_rkE = platform.malloc(Nfields*Nrk*    sizeof(dfloat));
+  o_rkX = platform.malloc<dfloat>(Nfields*Nrk);
+  o_rkA = platform.malloc<dfloat>(Nfields*Nrk*Nrk);
+  o_rkE = platform.malloc<dfloat>(Nfields*Nrk);
 
   dtMIN = 1E-9; //minumum allowed timestep
   ATOL = 1E-5;  //absolute error tolerance
@@ -123,16 +125,15 @@ sark5::sark5(dlong _Nelements, dlong _NhaloElements,
   sqrtinvNtotal = 1.0/sqrt(gNtotal);
 }
 
-void sark5::Run(occa::memory &o_q, dfloat start, dfloat end) {
+void sark5::Run(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat start, dfloat end) {
 
   dfloat time = start;
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
+  int rank = comm.rank();
 
   solver.Report(time,0);
 
-  dfloat outputInterval;
+  dfloat outputInterval=0.0;
   solver.settings.getSetting("OUTPUT INTERVAL", outputInterval);
 
   dfloat outputTime = time + outputInterval;
@@ -144,23 +145,17 @@ void sark5::Run(occa::memory &o_q, dfloat start, dfloat end) {
 
   while (time < end) {
 
-    if (dt<dtMIN){
-      stringstream ss;
-      ss << "Time step became too small at time step = " << tstep;
-      LIBP_ABORT(ss.str());
-    }
-    if (std::isnan(dt)) {
-      stringstream ss;
-      ss << "Solution became unstable at time step = " << tstep;
-      LIBP_ABORT(ss.str());
-    }
+    LIBP_ABORT("Time step became too small at time step = " << tstep,
+               dt<dtMIN);
+    LIBP_ABORT("Solution became unstable at time step = " << tstep,
+               std::isnan(dt));
 
     //check for final timestep
     if (time+dt > end){
       dt = end-time;
     }
 
-    Step(o_q, time, dt);
+    Step(solver, o_q, time, dt);
 
     // compute Dopri estimator
     dfloat err = Estimater(o_q);
@@ -169,7 +164,7 @@ void sark5::Run(occa::memory &o_q, dfloat start, dfloat end) {
     dfloat fac1 = pow(err,exp1);
     dfloat fac = fac1/pow(facold,beta);
 
-    fac = mymax(invfactor2, mymin(invfactor1,fac/safe));
+    fac = std::max(invfactor2, std::min(invfactor1,fac/safe));
     dfloat dtnew = dt/fac;
 
     if (err<1.0) { //dt is accepted
@@ -191,7 +186,7 @@ void sark5::Run(occa::memory &o_q, dfloat start, dfloat end) {
         UpdateCoefficients();
 
         // time step to output
-        Step(o_q, time, dt);
+        Step(solver, o_q, time, dt);
 
         // shift for output
         o_rkq.copyTo(o_q);
@@ -218,14 +213,15 @@ void sark5::Run(occa::memory &o_q, dfloat start, dfloat end) {
       time += dt;
       while (time>outputTime) outputTime+= outputInterval; //catch up next output in case dt>outputInterval
 
-      facold = mymax(err,1E-4); // hard coded factor ?
+      constexpr dfloat errMax = 1.0e-4;  // lower bound applied to facold (hard coded factor ?)
+      facold = std::max(err,errMax);
 
       // if (!rank)
       //   printf("\r time = %g (%d), dt = %g accepted                      ", time, allStep,  dt);
 
       tstep++;
     } else {
-      dtnew = dt/(mymax(invfactor1,fac1/safe));
+      dtnew = dt/(std::max(invfactor1,fac1/safe));
 
       // if (!rank)
       //   printf("\r time = %g (%d), dt = %g rejected, trying %g", time, allStep, dt, dtnew);
@@ -244,19 +240,19 @@ void sark5::Run(occa::memory &o_q, dfloat start, dfloat end) {
     printf("%d accepted steps and %d total steps\n", tstep, allStep);
 }
 
-void sark5::Backup(occa::memory &o_Q) {
-  o_saveq.copyFrom(o_Q, N*sizeof(dfloat));
+void sark5::Backup(deviceMemory<dfloat> &o_Q) {
+  o_saveq.copyFrom(o_Q, N);
 }
 
-void sark5::Restore(occa::memory &o_Q) {
-  o_saveq.copyTo(o_Q, N*sizeof(dfloat));
+void sark5::Restore(deviceMemory<dfloat> &o_Q) {
+  o_saveq.copyTo(o_Q, N);
 }
 
-void sark5::AcceptStep(occa::memory &o_q, occa::memory &o_rq) {
-  o_q.copyFrom(o_rq, N*sizeof(dfloat));
+void sark5::AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq) {
+  o_q.copyFrom(o_rq, N);
 }
 
-void sark5::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
+void sark5::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt) {
 
   //RK step
   for(int rk=0;rk<Nrk;++rk){
@@ -297,7 +293,7 @@ void sark5::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
   }
 }
 
-dfloat sark5::Estimater(occa::memory& o_q){
+dfloat sark5::Estimater(deviceMemory<dfloat>& o_q){
 
   //Error estimation
   //E. HAIRER, S.P. NORSETT AND G. WANNER, SOLVING ORDINARY
@@ -310,13 +306,12 @@ dfloat sark5::Estimater(occa::memory& o_q){
                         o_rkerr,
                         o_errtmp);
 
-  o_errtmp.copyTo(errtmp);
-  dfloat localerr = 0;
+  h_errtmp.copyFrom(o_errtmp);
   dfloat err = 0;
   for(dlong n=0;n<Nblock;++n){
-    localerr += errtmp[n];
+    err += h_errtmp[n];
   }
-  MPI_Allreduce(&localerr, &err, 1, MPI_DFLOAT, MPI_SUM, comm);
+  comm.Allreduce(err);
 
   err = sqrt(err)*sqrtinvNtotal;
 
@@ -350,9 +345,9 @@ void sark5::UpdateCoefficients() {
 
       dfloat _rkE[Nrk]=  {-4./45., 0, 16./45., -8./15., 16./45., -4./45., 0.};
 
-      memcpy(rkX+n*Nrk    ,_rkX,    Nrk*sizeof(dfloat));
-      memcpy(rkA+n*Nrk*Nrk,_rkA,Nrk*Nrk*sizeof(dfloat));
-      memcpy(rkE+n*Nrk    ,_rkE,    Nrk*sizeof(dfloat));
+      h_rkX.copyFrom(_rkX,    Nrk,n*Nrk    );
+      h_rkA.copyFrom(_rkA,Nrk*Nrk,n*Nrk*Nrk);
+      h_rkE.copyFrom(_rkE,    Nrk,n*Nrk    );
 
     } else {
 
@@ -455,7 +450,13 @@ void sark5::UpdateCoefficients() {
       dfloat b4=real(cb4)/ (double) Nr;
       dfloat b6=real(cb6)/ (double) Nr;
 
-      dfloat _rkX[Nrk]  = {1.0, exp(0.25*alpha), exp(0.25*alpha),exp(0.5*alpha), exp(0.75*alpha), exp(alpha), exp(alpha)};
+      dfloat _rkX[Nrk]  = {1.0,
+                           std::exp(dfloat(0.25)*alpha),
+                           std::exp(dfloat(0.25)*alpha),
+                           std::exp(dfloat(0.5)*alpha),
+                           std::exp(dfloat(0.75)*alpha),
+                           std::exp(alpha),
+                           std::exp(alpha)};
       dfloat _rkA[Nrk*Nrk]={   0,   0,     0,     0,     0,    0,   0,
                              a21,   0,     0,     0,     0,    0,   0,
                              a31, a32,     0,     0,     0,    0,   0,
@@ -466,70 +467,48 @@ void sark5::UpdateCoefficients() {
 
       dfloat _rkE[Nrk]= {b1, 0, b3, b4, a75, b6, 0};
 
-      memcpy(rkX+n*Nrk    ,_rkX,    Nrk*sizeof(dfloat));
-      memcpy(rkA+n*Nrk*Nrk,_rkA,Nrk*Nrk*sizeof(dfloat));
-      memcpy(rkE+n*Nrk    ,_rkE,    Nrk*sizeof(dfloat));
+      h_rkX.copyFrom(_rkX,    Nrk,n*Nrk    );
+      h_rkA.copyFrom(_rkA,Nrk*Nrk,n*Nrk*Nrk);
+      h_rkE.copyFrom(_rkE,    Nrk,n*Nrk    );
     }
 
     // move data to platform
-    // o_rkX.copyFrom(rkX, "async: true");
-    // o_rkA.copyFrom(rkA, "async: true");
-    // o_rkE.copyFrom(rkE, "async: true");
-    o_rkX.copyFrom(rkX);
-    o_rkA.copyFrom(rkA);
-    o_rkE.copyFrom(rkE);
+    // o_rkX.copyFrom(rkX, properties_t("async", true));
+    // o_rkA.copyFrom(rkA, properties_t("async", true));
+    // o_rkE.copyFrom(rkE, properties_t("async", true));
+    h_rkX.copyTo(o_rkX);
+    h_rkA.copyTo(o_rkA);
+    h_rkE.copyTo(o_rkE);
   }
 }
 
-sark5::~sark5() {
-  if (o_rkq.size()) o_rkq.free();
-  if (o_rkrhsq.size()) o_rkrhsq.free();
-  if (o_rkerr.size()) o_rkerr.free();
-  if (o_errtmp.size()) o_errtmp.free();
-  if (o_rkX.size()) o_rkX.free();
-  if (o_rkA.size()) o_rkA.free();
-  if (o_rkE.size()) o_rkE.free();
-
-  if (errtmp) free(errtmp);
-  if (lambda) free(lambda);
-  if (rkC) free(rkC);
-
-  if (h_rkX.size()) h_rkX.free();
-  if (h_rkA.size()) h_rkA.free();
-  if (h_rkE.size()) h_rkE.free();
-
-  rkUpdateKernel.free();
-  rkStageKernel.free();
-  rkErrorEstimateKernel.free();
-}
-
 /**************************************************/
 /* PML version                                    */
 /**************************************************/
 
 sark5_pml::sark5_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements,
             int _Np, int _Nfields, int _Npmlfields,
-            dfloat *_lambda, solver_t& _solver, MPI_Comm _comm):
-  sark5(_Nelements, _NhaloElements, _Np, _Nfields, _lambda, _solver, _comm),
+            memory<dfloat> _lambda,
+            platform_t& _platform, comm_t _comm):
+  sark5(_Nelements, _NhaloElements, _Np, _Nfields, _lambda, _platform, _comm),
   Npml(_Npmlfields*_Np*_NpmlElements) {
 
   if (Npml) {
-    platform_t &platform = solver.platform;
+    memory<dfloat> pmlq(Npml,0.0);
+    o_pmlq   = platform.malloc<dfloat>(pmlq);
 
-    dfloat *pmlq = (dfloat *) calloc(Npml,sizeof(dfloat));
-    o_pmlq   = platform.malloc(Npml*sizeof(dfloat), pmlq);
-    free(pmlq);
+    o_rkpmlq    = platform.malloc<dfloat>(Npml);
+    o_rhspmlq   = platform.malloc<dfloat>(Npml);
+    o_rkrhspmlq = platform.malloc<dfloat>(Npml*Nrk);
 
-    o_rkpmlq    = platform.malloc(Npml*sizeof(dfloat));
-    o_rhspmlq   = platform.malloc(Npml*sizeof(dfloat));
-    o_rkrhspmlq = platform.malloc(Npml*Nrk*sizeof(dfloat));
+    o_savepmlq   = platform.malloc<dfloat>(Npml);
 
-    o_savepmlq   = platform.malloc(Npml*sizeof(dfloat));
+    properties_t kernelInfo = platform.props(); //copy base occa properties from platform
 
-    occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+    const int blocksize=256;
 
     //add defines
-    kernelInfo["defines/" "p_blockSize"] = (int)BLOCKSIZE;
+    kernelInfo["defines/" "p_blockSize"] = (int)blocksize;
     kernelInfo["defines/" "p_Nrk"]     = (int)Nrk;
     kernelInfo["defines/" "p_Np"]      = (int)Np;
     kernelInfo["defines/" "p_Nfields"] = (int)Nfields;
@@ -545,7 +524,7 @@ sark5_pml::sark5_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements
                                       kernelInfo);
 
     // Semi-Analytic Runge Kutta - order (3) 4 with PID timestep control
-    pmlrkA = (dfloat*) malloc(Nrk*Nrk*sizeof(dfloat));
+    pmlrkA.malloc(Nrk*Nrk);
 
     dfloat _pmlrkA[Nrk*Nrk] =  {     0,      0,       0,       0,       0,      0,   0,
                                    1/4,      0,       0,       0,       0,      0,   0,
@@ -555,31 +534,31 @@ sark5_pml::sark5_pml(dlong _Nelements, dlong _NpmlElements, dlong _NhaloElements
                                 -3./7.,  8./7.,   6./7., -12./7.,   8./7.,      0,   0,
                                 7./90.,     0., 16./45.,  2./15., 16./45., 7./90.,   0};
 
-    memcpy(pmlrkA, _pmlrkA, Nrk*Nrk*sizeof(dfloat));
+    pmlrkA.copyFrom(_pmlrkA);
 
-    o_pmlrkA = platform.malloc(Nrk*Nrk*sizeof(dfloat), pmlrkA);
+    o_pmlrkA = platform.malloc<dfloat>(pmlrkA);
   }
 }
 
-void sark5_pml::Backup(occa::memory &o_Q) {
-  o_saveq.copyFrom(o_Q, N*sizeof(dfloat));
+void sark5_pml::Backup(deviceMemory<dfloat> &o_Q) {
+  o_saveq.copyFrom(o_Q, N);
   if (Npml)
-    o_savepmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat));
+    o_savepmlq.copyFrom(o_rkpmlq, Npml);
 }
 
-void sark5_pml::Restore(occa::memory &o_Q) {
-  o_saveq.copyTo(o_Q, N*sizeof(dfloat));
+void sark5_pml::Restore(deviceMemory<dfloat> &o_Q) {
+  o_saveq.copyTo(o_Q, N);
   if (Npml)
-    o_savepmlq.copyTo(o_rkpmlq, Npml*sizeof(dfloat));
+    o_savepmlq.copyTo(o_rkpmlq, Npml);
 }
 
-void sark5_pml::AcceptStep(occa::memory &o_q, occa::memory &o_rq) {
-  o_q.copyFrom(o_rq, N*sizeof(dfloat));
+void sark5_pml::AcceptStep(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_rq) {
+  o_q.copyFrom(o_rq, N);
   if (Npml)
-    o_pmlq.copyFrom(o_rkpmlq, Npml*sizeof(dfloat));
+    o_pmlq.copyFrom(o_rkpmlq, Npml);
 }
 
-void sark5_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
+void sark5_pml::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt) {
 
   //RK step
   for(int rk=0;rk<Nrk;++rk){
@@ -638,17 +617,6 @@ void sark5_pml::Step(occa::memory &o_q, dfloat time, dfloat _dt) {
   }
 }
 
-sark5_pml::~sark5_pml() {
-  if (o_pmlq.size()) o_pmlq.free();
-  if (o_rkpmlq.size()) o_rkpmlq.free();
-  if (o_rhspmlq.size()) o_rhspmlq.free();
-  if (o_rkrhspmlq.size()) o_rkrhspmlq.free();
-  if (o_pmlrkA.size()) o_pmlrkA.free();
-
-  if (o_savepmlq.size()) o_savepmlq.free();
-
-  rkPmlUpdateKernel.free();
-  rkPmlStageKernel.free();
-}
-
 } //namespace TimeStepper
+
+} //namespace libp
diff --git a/libs/timeStepper/timeStepperSSBDF3.cpp b/libs/timeStepper/timeStepperSSBDF3.cpp
index d48021fb2..a13d89da3 100644
--- a/libs/timeStepper/timeStepperSSBDF3.cpp
+++ b/libs/timeStepper/timeStepperSSBDF3.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,25 +27,29 @@ SOFTWARE.
 #include "core.hpp"
 #include "timeStepper.hpp"
 
+namespace libp {
+
 namespace TimeStepper {
 
 /* Backward Difference Formula, order 3, with subcycling */
 ssbdf3::ssbdf3(dlong Nelements, dlong NhaloElements,
-                 int Np, int Nfields, solver_t& _solver):
-  timeStepper_t(Nelements, NhaloElements, Np, Nfields, _solver) {
-
-  platform_t &platform = solver.platform;
+                 int Np, int Nfields,
+                 platform_t& _platform, comm_t _comm):
+  timeStepperBase_t(Nelements, NhaloElements, Np, Nfields,
+                    _platform, _comm) {
 
   Nstages = 3;
   shiftIndex = 0;
 
-  o_qn   = platform.malloc(Nstages*N*sizeof(dfloat)); //q history
-  o_qhat = platform.malloc(Nstages*N*sizeof(dfloat)); //F(q) history (explicit part)
-  o_rhs  = platform.malloc(N*sizeof(dfloat)); //rhs storage
+  o_qn   = platform.malloc<dfloat>(Nstages*N); //q history
+  o_qhat = platform.malloc<dfloat>(Nstages*N); //F(q) history (explicit part)
+  o_rhs  = platform.malloc<dfloat>(N); //rhs storage
 
-  occa::properties kernelInfo = platform.props; //copy base occa properties from solver
+  properties_t kernelInfo = platform.props(); //copy base occa properties from platform
 
-  kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE;
+  const int blocksize=256;
+
+  kernelInfo["defines/" "p_blockSize"] = blocksize;
   kernelInfo["defines/" "p_Nstages"] = Nstages;
 
   rhsKernel = platform.buildKernel(TIMESTEPPER_DIR "/okl/"
@@ -59,23 +63,23 @@ ssbdf3::ssbdf3(dlong Nelements, dlong NhaloElements,
                          3./2.,    2., -1./2.,    0.,
                         11./6.,    3., -3./2., 1./3.};
 
-  ssbdf_b = (dfloat*) calloc(Nstages*(Nstages+1), sizeof(dfloat));
-  memcpy(ssbdf_b, _b, Nstages*(Nstages+1)*sizeof(dfloat));
+  ssbdf_b.malloc(Nstages*(Nstages+1));
+  ssbdf_b.copyFrom(_b);
 
-  o_ssbdf_b = platform.malloc(Nstages*(Nstages+1)*sizeof(dfloat), ssbdf_b);
+  o_ssbdf_b = platform.malloc<dfloat>(ssbdf_b);
 }
 
-dfloat ssbdf3::getGamma() {
-  return *(ssbdf_b + (Nstages-1)*(Nstages+1)); //first entry of last row of B
+dfloat ssbdf3::GetGamma() {
+  return ssbdf_b[(Nstages-1)*(Nstages+1)]; //first entry of last row of B
 }
 
-void ssbdf3::Run(occa::memory &o_q, dfloat start, dfloat end) {
+void ssbdf3::Run(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat start, dfloat end) {
 
   dfloat time = start;
 
   solver.Report(time,0);
 
-  dfloat outputInterval;
+  dfloat outputInterval=0.0;
   solver.settings.getSetting("OUTPUT INTERVAL", outputInterval);
 
   dfloat outputTime = time + outputInterval;
@@ -83,7 +87,7 @@ void ssbdf3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   int tstep=0;
   int order=0;
   while (time < end) {
-    Step(o_q, time, dt, order);
+    Step(solver, o_q, time, dt, order);
     time += dt;
     tstep++;
     if (order<Nstages-1) order++;
@@ -96,15 +100,15 @@ void ssbdf3::Run(occa::memory &o_q, dfloat start, dfloat end) {
   }
 }
 
-void ssbdf3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
+void ssbdf3::Step(solver_t& solver, deviceMemory<dfloat> &o_q, dfloat time, dfloat _dt, int order) {
 
   //BDF coefficients at current order
-  occa::memory o_B = o_ssbdf_b + order*(Nstages+1)*sizeof(dfloat);
-  dfloat *B = ssbdf_b + order*(Nstages+1);
+  deviceMemory<dfloat> o_B = o_ssbdf_b + order*(Nstages+1);
+  memory<dfloat> B = ssbdf_b + order*(Nstages+1);
 
   //put current q into history
-  occa::memory o_qn0 = o_qn + shiftIndex*N*sizeof(dfloat);
-  o_qn0.copyFrom(o_q, N*sizeof(dfloat));
+  deviceMemory<dfloat> o_qn0 = o_qn + shiftIndex*N;
+  o_qn0.copyFrom(o_q, N);
 
   // Compute qhat = sum_i=1^s B_i qhat(t_n+1-i) by
   // where qhat(t) is the Lagrangian state of q
@@ -129,15 +133,6 @@ void ssbdf3::Step(occa::memory &o_q, dfloat time, dfloat _dt, int order) {
   shiftIndex = (shiftIndex+Nstages-1)%Nstages;
 }
 
-ssbdf3::~ssbdf3() {
-  if (o_rhs.size()) o_rhs.free();
-  if (o_qn.size()) o_qn.free();
-  if (o_qhat.size()) o_qhat.free();
-  if (o_ssbdf_b.size()) o_ssbdf_b.free();
-
-  if (ssbdf_b) free(ssbdf_b);
-
-  rhsKernel.free();
-}
-
 } //namespace TimeStepper
+
+} //namespace libp
diff --git a/make.top b/make.top
index 322ec2399..63aad7879 100644
--- a/make.top
+++ b/make.top
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -27,57 +27,51 @@
 #can be GNU or INTEL
 LIBP_ARCH=GNU
 
-#absolute path to libparanumal
+#absolute path to LIBP
 export LIBP_DIR:=$(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
 
 export LIBP_INCLUDE_DIR=${LIBP_DIR}/include
-export LIBP_TPL_DIR=${LIBP_DIR}/3rdParty
 export LIBP_LIBS_DIR=${LIBP_DIR}/libs
 export LIBP_TEST_DIR=${LIBP_DIR}/test
 
-#paths to installed blas and lapack libraries
-export LIBP_BLAS_DIR  =/usr/lib/x86_64-linux-gnu
-export LIBP_LAPACK_DIR=/usr/lib/x86_64-linux-gnu
-export LIBP_BLAS_LIB  =-L${LIBP_BLAS_DIR} -lblas
-export LIBP_LAPACK_LIB=-L${LIBP_LAPACK_DIR} -llapack
+export LIBP_BLAS_DIR?=/usr/lib/x86_64-linux-gnu/openblas-serial
+export LIBP_BLAS_LIB=-L${LIBP_BLAS_DIR} -lopenblas
 
 #include OCCA
 export OCCA_DIR=${LIBP_DIR}/occa
 
 #compilers to use for C/C++
-export LIBP_MPICC = mpicc
-export LIBP_MPICXX= mpic++
-export LIBP_LD    = mpic++
+export LIBP_CC = mpicc
+export LIBP_CXX= mpic++
+export LIBP_LD = mpic++
 
-export LIBP_CFLAGS=-O2 -fopenmp -g -Wall -Wshadow -Wno-unused-function
-export LIBP_CXXFLAGS=-O2 -fopenmp -g -Wall -Wshadow -Wno-unused-function -std=c++11
-
-ifeq (1,${LIBP_COVERAGE})
-export LIBP_CXXFLAGS+= --coverage -fprofile-abs-path
-endif
-
-export LIBP_MPICFLAGS=$(LIBP_CFLAGS)
-export LIBP_MPICXXFLAGS=$(LIBP_CXXFLAGS)
+export LIBP_INCLUDES=-I${LIBP_INCLUDE_DIR} -I${OCCA_DIR}/include
+export LIBP_LIBS= -Wl,-rpath,$(LIBP_BLAS_DIR) ${LIBP_BLAS_LIB}  \
+                  -Wl,-rpath,$(OCCA_DIR)/lib -L$(OCCA_DIR)/lib -locca
 
-export LIBP_LIBS=${LIBP_BLAS_LIB} \
-                 ${LIBP_LAPACK_LIB} \
-                 -Wl,-rpath=$(OCCA_DIR)/lib -L$(OCCA_DIR)/lib -locca
 
-export LIBP_DEFINES=
+ifneq (,${debug})
+  export LIBP_CFLAGS=-O0 -g -Wall -Wshadow -Wno-unused-function -Wno-unknown-pragmas
+  export LIBP_CXXFLAGS=-O0 -g -Wall -Wshadow -Wno-unused-function -Wno-unknown-pragmas -std=c++17
+  export LIBP_DEFINES=-DLIBP_DEBUG
+else
+  export LIBP_CFLAGS=-fopenmp -O3 -Wall -Wshadow -Wno-unused-function
+  export LIBP_CXXFLAGS=-fopenmp -O3 -Wall -Wshadow -Wno-unused-function -std=c++17
+  export LIBP_DEFINES=
 
-export LIBP_INCLUDES=-I${LIBP_INCLUDE_DIR} -I${OCCA_DIR}/include
+  ifeq (GNU,${LIBP_ARCH})
+    LIBP_CFLAGS+= -mavx2 -ftree-vectorize -march=native -DGLIBCXX_PARALLEL
+    LIBP_CXXFLAGS+= -mavx2 -ftree-vectorize -march=native -DGLIBCXX_PARALLEL
+  else ifeq (INTEL,${LIBP_ARCH})
+    LIBP_CFLAGS+=-funroll-loops -xHost
+    LIBP_CXXFLAGS+=-funroll-loops -xHost
+  else
+    $(error unknown arch for [LIBP_ARCH] specified)
+  endif
+endif
 
-ifeq (GNU,${LIBP_ARCH})
-#   LIBP_CXXFLAGS+= -mavx2 -ftree-vectorize -march=native
-  #-funroll-all-loops
-  #LIBP_LIBS+=-L/opt/apps/gcc5_2/atlas/3.10.2/lib/ -llapack -latlas  -lf77blas -lcblas -lptcblas -lptf77blas -lsatlas  -lgfortran  -L../../../libxsmm/lib -lxsmm -ldl
-else ifeq (INTEL,${LIBP_ARCH})
-  LIBP_CXXFLAGS+= -funroll-loops -xHost
-  LIBP_LIBS+=-L/opt/apps/intel15_3/mkl/11.2.3/lib/intel64  -lmkl_rt
-  #  -fopt-info-vec-missed  -fopt-info
-  #-fopt-info
-else
-  $(error unknown arch for [LIBP_ARCH] specified)
+ifeq (1,${LIBP_COVERAGE})
+  export LIBP_CXXFLAGS+= --coverage -fprofile-abs-path
 endif
 
 export OBJ_COLOR = \033[0;36m
diff --git a/makefile b/makefile
index 5b2bc1429..c2befaa55 100644
--- a/makefile
+++ b/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
diff --git a/occa b/occa
index 327582bfb..11552d0dc 160000
--- a/occa
+++ b/occa
@@ -1 +1 @@
-Subproject commit 327582bfb6667defb008d743961fa38053960214
+Subproject commit 11552d0dc02fb9880f61f46e46115b6a50dada32
diff --git a/solvers/acoustics/acoustics.hpp b/solvers/acoustics/acoustics.hpp
index a264ee89b..26bd8043a 100644
--- a/solvers/acoustics/acoustics.hpp
+++ b/solvers/acoustics/acoustics.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -36,55 +36,56 @@ SOFTWARE.
 
 #define DACOUSTICS LIBP_DIR"/solvers/acoustics/"
 
+using namespace libp;
+
 class acousticsSettings_t: public settings_t {
 public:
-  acousticsSettings_t(MPI_Comm& _comm);
+  acousticsSettings_t(comm_t _comm);
   void report();
   void parseFromFile(platformSettings_t& platformSettings,
                      meshSettings_t& meshSettings,
-                     const string filename);
+                     const std::string filename);
 };
 
 class acoustics_t: public solver_t {
 public:
-  mesh_t &mesh;
+  mesh_t mesh;
 
   int Nfields;
 
-  TimeStepper::timeStepper_t* timeStepper;
+  timeStepper_t timeStepper;
 
-  halo_t* traceHalo;
+  ogs::halo_t traceHalo;
 
-  dfloat *q;
-  occa::memory o_q;
+  memory<dfloat> q;
+  deviceMemory<dfloat> o_q;
 
-  occa::memory o_Mq;
+  deviceMemory<dfloat> o_Mq;
 
-  occa::kernel volumeKernel;
-  occa::kernel surfaceKernel;
+  kernel_t volumeKernel;
+  kernel_t surfaceKernel;
 
-  occa::kernel initialConditionKernel;
+  kernel_t initialConditionKernel;
 
-  acoustics_t() = delete;
+  acoustics_t() = default;
   acoustics_t(platform_t &_platform, mesh_t &_mesh,
-              acousticsSettings_t& _settings):
-    solver_t(_platform, _settings), mesh(_mesh) {}
-
-  ~acoustics_t();
+              acousticsSettings_t& _settings) {
+    Setup(_platform, _mesh, _settings);
+  }
 
   //setup
-  static acoustics_t& Setup(platform_t& platform, mesh_t& mesh,
-                            acousticsSettings_t& settings);
+  void Setup(platform_t& _platform, mesh_t& _mesh,
+             acousticsSettings_t& _settings);
 
   void Run();
 
   void Report(dfloat time, int tstep);
 
-  void PlotFields(dfloat* Q, char *fileName);
+  void PlotFields(memory<dfloat> Q, const std::string fileName);
 
-  void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time);
+  void rhsf(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time);
 
   dfloat MaxWaveSpeed();
 };
 
-#endif
\ No newline at end of file
+#endif
diff --git a/solvers/acoustics/acousticsMain.cpp b/solvers/acoustics/acousticsMain.cpp
index b9de52308..834f55bb3 100644
--- a/solvers/acoustics/acousticsMain.cpp
+++ b/solvers/acoustics/acousticsMain.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,39 +29,40 @@ SOFTWARE.
 int main(int argc, char **argv){
 
   // start up MPI
-  MPI_Init(&argc, &argv);
+  Comm::Init(argc, argv);
 
-  MPI_Comm comm = MPI_COMM_WORLD;
+  LIBP_ABORT("Usage: ./acousticsMain setupfile", argc!=2);
 
-  if(argc!=2)
-    LIBP_ABORT(string("Usage: ./acousticsMain setupfile"));
+  { /*Scope so everything is destructed before MPI_Finalize */
+    comm_t comm(Comm::World().Dup());
 
-  //create default settings
-  platformSettings_t platformSettings(comm);
-  meshSettings_t meshSettings(comm);
-  acousticsSettings_t acousticsSettings(comm);
+    //create default settings
+    platformSettings_t platformSettings(comm);
+    meshSettings_t meshSettings(comm);
+    acousticsSettings_t acousticsSettings(comm);
 
-  //load settings from file
-  acousticsSettings.parseFromFile(platformSettings, meshSettings,
-                            argv[1]);
+    //load settings from file
+    acousticsSettings.parseFromFile(platformSettings, meshSettings,
+                              argv[1]);
 
-  // set up platform
-  platform_t platform(platformSettings);
+    // set up platform
+    platform_t platform(platformSettings);
 
-  platformSettings.report();
-  meshSettings.report();
-  acousticsSettings.report();
+    platformSettings.report();
+    meshSettings.report();
+    acousticsSettings.report();
 
-  // set up mesh
-  mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm);
+    // set up mesh
+    mesh_t mesh(platform, meshSettings, comm);
 
-  // set up acoustics solver
-  acoustics_t& acoustics = acoustics_t::Setup(platform, mesh, acousticsSettings);
+    // set up acoustics solver
+    acoustics_t acoustics(platform, mesh, acousticsSettings);
 
-  // run
-  acoustics.Run();
+    // run
+    acoustics.Run();
+  }
 
   // close down MPI
-  MPI_Finalize();
+  Comm::Finalize();
   return LIBP_SUCCESS;
 }
diff --git a/solvers/acoustics/data/acousticsGaussian2D.h b/solvers/acoustics/data/acousticsGaussian2D.h
index db34505dd..621a65159 100644
--- a/solvers/acoustics/data/acousticsGaussian2D.h
+++ b/solvers/acoustics/data/acousticsGaussian2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/acoustics/data/acousticsGaussian3D.h b/solvers/acoustics/data/acousticsGaussian3D.h
index 14ce2bfe6..cedf44200 100644
--- a/solvers/acoustics/data/acousticsGaussian3D.h
+++ b/solvers/acoustics/data/acousticsGaussian3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -50,4 +50,4 @@ SOFTWARE.
   *(u) = 0.0;                           \
   *(v) = 0.0;                           \
   *(w) = 0.0;                           \
-}
\ No newline at end of file
+}
diff --git a/solvers/acoustics/makefile b/solvers/acoustics/makefile
index d19c9e494..bf2875350 100644
--- a/solvers/acoustics/makefile
+++ b/solvers/acoustics/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -77,11 +77,8 @@ include ../../make.top
 endif
 endif
 
-#gslib
-GS_DIR=${LIBP_TPL_DIR}/gslib
-
 #libraries
-ACOUSTICS_LIBP_LIBS=timeStepper mesh ogs linAlg core
+ACOUSTICS_LIBP_LIBS=timeStepper mesh parAdogs ogs linAlg core
 
 #includes
 INCLUDES=${LIBP_INCLUDES} \
@@ -92,11 +89,10 @@ DEFINES =${LIBP_DEFINES} \
          -DLIBP_DIR='"${LIBP_DIR}"'
 
 #.cpp compilation flags
-ACOUSTICS_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES}
+ACOUSTICS_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES}
 
 #link libraries
 LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(ACOUSTICS_LIBP_LIBS)) \
-     -L$(GS_DIR)/lib -lgs \
      ${LIBP_LIBS}
 
 #link flags
@@ -144,10 +140,10 @@ endif
 # rule for .cpp files
 %.o: %.cpp $(DEPS) | libp_libs
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $*.o -c $*.cpp $(ACOUSTICS_CXXFLAGS)
+	$(LIBP_CXX) -o $*.o -c $*.cpp $(ACOUSTICS_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $*.o -c $*.cpp $(ACOUSTICS_CXXFLAGS)
+	@$(LIBP_CXX) -o $*.o -c $*.cpp $(ACOUSTICS_CXXFLAGS)
 endif
 
 #cleanup
@@ -158,8 +154,7 @@ clean-libs: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} clean
 
 clean-kernels: clean-libs
-# 	$(shell ${OCCA_DIR}/bin/occa clear all -y)
-	rm -rf ~/.occa/
+	rm -rf ${LIBP_DIR}/.occa/
 
 realclean: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} realclean
@@ -177,4 +172,4 @@ info:
 	@true
 
 test: acousticsMain
-	@${MAKE} -C $(LIBP_TEST_DIR) --no-print-directory  test-acoustics
\ No newline at end of file
+	@${MAKE} -C $(LIBP_TEST_DIR) --no-print-directory  test-acoustics
diff --git a/solvers/acoustics/okl/acousticsInitialCondition2D.okl b/solvers/acoustics/okl/acousticsInitialCondition2D.okl
index 71255c37d..7fcfde446 100644
--- a/solvers/acoustics/okl/acousticsInitialCondition2D.okl
+++ b/solvers/acoustics/okl/acousticsInitialCondition2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/acoustics/okl/acousticsInitialCondition3D.okl b/solvers/acoustics/okl/acousticsInitialCondition3D.okl
index 23972ab5a..c2cc6f0f8 100644
--- a/solvers/acoustics/okl/acousticsInitialCondition3D.okl
+++ b/solvers/acoustics/okl/acousticsInitialCondition3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/acoustics/okl/acousticsSurfaceHex3D.okl b/solvers/acoustics/okl/acousticsSurfaceHex3D.okl
index b102d29a8..972d6c96a 100644
--- a/solvers/acoustics/okl/acousticsSurfaceHex3D.okl
+++ b/solvers/acoustics/okl/acousticsSurfaceHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -116,6 +116,7 @@ void surfaceTerms(const int e,
 
 // batch process elements
 @kernel void acousticsSurfaceHex3D(const dlong Nelements,
+                                  @restrict const  dlong  *  elementIds,
                                   @restrict const  dfloat *  sgeo,
                                   @restrict const  dfloat *  LIFT,
                                   @restrict const  dlong  *  vmapM,
@@ -131,63 +132,66 @@ void surfaceTerms(const int e,
   // for all elements
   for(dlong eo=0;eo<Nelements;eo+=p_NblockS;@outer(0)){
 
+    @exclusive dlong r_e, element;
+
     // for all face nodes of all elements
     // face 0 & 5
     for(int es=0;es<p_NblockS;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
-          const dlong e = eo + es;
-          if(e<Nelements){
-            const dlong sk0 = e*p_Nfp*p_Nfaces + 0*p_Nfp + j*p_Nq + i;
-            const dlong sk5 = e*p_Nfp*p_Nfaces + 5*p_Nfp + j*p_Nq + i;
+          r_e = eo + es;
+          if(r_e<Nelements){
+            element = elementIds[r_e];
+
+            const dlong sk0 = element*p_Nfp*p_Nfaces + 0*p_Nfp + j*p_Nq + i;
+            const dlong sk5 = element*p_Nfp*p_Nfaces + 5*p_Nfp + j*p_Nq + i;
 
             //      surfaceTerms(sk0,0,i,j,0     );
-            surfaceTerms(e,sk0,0,i,j,0, sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
+            surfaceTerms(element,sk0,0,i,j,0, sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
 
             //surfaceTerms(sk5,5,i,j,(p_Nq-1));
-            surfaceTerms(e,sk5,5,i,j,(p_Nq-1), sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
+            surfaceTerms(element,sk5,5,i,j,(p_Nq-1), sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
           }
         }
       }
     }
 
-    @barrier("global");
+    /*Need barriers because surfaceTerms writes to global*/
+    @barrier();
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(2)){
       for(int k=0;k<p_Nq;++k;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
-          const dlong e = eo + es;
-          if(e<Nelements){
-            const dlong sk1 = e*p_Nfp*p_Nfaces + 1*p_Nfp + k*p_Nq + i;
-            const dlong sk3 = e*p_Nfp*p_Nfaces + 3*p_Nfp + k*p_Nq + i;
+          if(r_e<Nelements){
+            const dlong sk1 = element*p_Nfp*p_Nfaces + 1*p_Nfp + k*p_Nq + i;
+            const dlong sk3 = element*p_Nfp*p_Nfaces + 3*p_Nfp + k*p_Nq + i;
 
             //      surfaceTerms(sk1,1,i,0     ,k);
-            surfaceTerms(e,sk1,1,i,0,k, sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
+            surfaceTerms(element,sk1,1,i,0,k, sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
 
             //      surfaceTerms(sk3,3,i,(p_Nq-1),k);
-            surfaceTerms(e,sk3,3,i,(p_Nq-1),k, sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
+            surfaceTerms(element,sk3,3,i,(p_Nq-1),k, sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
           }
         }
       }
     }
 
-    @barrier("global");
+    @barrier();
 
     // face 2 & 4
     for(int es=0;es<p_NblockS;++es;@inner(2)){
       for(int k=0;k<p_Nq;++k;@inner(1)){
         for(int j=0;j<p_Nq;++j;@inner(0)){
-          const dlong e = eo + es;
-          if(e<Nelements){
-            const dlong sk2 = e*p_Nfp*p_Nfaces + 2*p_Nfp + k*p_Nq + j;
-            const dlong sk4 = e*p_Nfp*p_Nfaces + 4*p_Nfp + k*p_Nq + j;
+          if(r_e<Nelements){
+            const dlong sk2 = element*p_Nfp*p_Nfaces + 2*p_Nfp + k*p_Nq + j;
+            const dlong sk4 = element*p_Nfp*p_Nfaces + 4*p_Nfp + k*p_Nq + j;
 
             //      surfaceTerms(sk2,2,(p_Nq-1),j,k);
-            surfaceTerms(e,sk2,2,(p_Nq-1),j,k, sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
+            surfaceTerms(element,sk2,2,(p_Nq-1),j,k, sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
 
             //surfaceTerms(sk4,4,0     ,j,k);
-            surfaceTerms(e,sk4,4,0,j,k, sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
+            surfaceTerms(element,sk4,4,0,j,k, sgeo, x, y, z, vmapM, vmapP, EToB, q, rhsq);
           }
         }
       }
diff --git a/solvers/acoustics/okl/acousticsSurfaceQuad2D.okl b/solvers/acoustics/okl/acousticsSurfaceQuad2D.okl
index b75f1ad18..255449786 100644
--- a/solvers/acoustics/okl/acousticsSurfaceQuad2D.okl
+++ b/solvers/acoustics/okl/acousticsSurfaceQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -105,6 +105,7 @@ void surfaceTerms(const int e,
 
 // batch process elements
 @kernel void acousticsSurfaceQuad2D(const dlong Nelements,
+                                   @restrict const  dlong  *  elementIds,
                                    @restrict const  dfloat *  sgeo,
                                    @restrict const  dfloat *  LIFT,
                                    @restrict const  dlong  *  vmapM,
@@ -125,6 +126,8 @@ void surfaceTerms(const int e,
     @shared dfloat s_uflux[p_NblockS][p_Nq][p_Nq];
     @shared dfloat s_vflux[p_NblockS][p_Nq][p_Nq];
 
+    @exclusive dlong r_e, element; // r_e: position in elementIds (eo+es); element: id read from elementIds[r_e]; set in the first @inner loop, reused by the later ones
+
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
         #pragma unroll p_Nq
@@ -136,59 +139,53 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
-
     // for all face nodes of all elements
     // face 0 & 2
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
-        const dlong e = eo + es;
-        if(e<Nelements){
-          const dlong sk0 = e*p_Nfp*p_Nfaces + 0*p_Nfp + i;
-          const dlong sk2 = e*p_Nfp*p_Nfaces + 2*p_Nfp + i;
+        r_e = eo + es;
+        if(r_e<Nelements){
+          element = elementIds[r_e];
+
+          const dlong sk0 = element*p_Nfp*p_Nfaces + 0*p_Nfp + i;
+          const dlong sk2 = element*p_Nfp*p_Nfaces + 2*p_Nfp + i;
 
           //          surfaceTerms(sk0,0,i,0     );
-          surfaceTerms(e, es, sk0, 0, i, 0,
+          surfaceTerms(element, es, sk0, 0, i, 0,
                        sgeo, x, y, vmapM, vmapP, EToB, q, s_rflux, s_uflux, s_vflux);
 
           //      surfaceTerms(sk2,2,i,p_Nq-1);
-          surfaceTerms(e, es, sk2, 2, i, p_Nq-1,
+          surfaceTerms(element, es, sk2, 2, i, p_Nq-1,
                        sgeo, x, y, vmapM, vmapP, EToB, q, s_rflux, s_uflux, s_vflux);
         }
       }
     }
 
-    @barrier("local");
-
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int j=0;j<p_Nq;++j;@inner(0)){
-        const dlong e = eo + es;
-        if(e<Nelements){
-          const dlong sk1 = e*p_Nfp*p_Nfaces + 1*p_Nfp + j;
-          const dlong sk3 = e*p_Nfp*p_Nfaces + 3*p_Nfp + j;
+        if(r_e<Nelements){
+          const dlong sk1 = element*p_Nfp*p_Nfaces + 1*p_Nfp + j;
+          const dlong sk3 = element*p_Nfp*p_Nfaces + 3*p_Nfp + j;
 
           //          surfaceTerms(sk1,1,p_Nq-1,j);
-          surfaceTerms(e, es, sk1, 1, p_Nq-1, j,
+          surfaceTerms(element, es, sk1, 1, p_Nq-1, j,
                        sgeo, x, y, vmapM, vmapP, EToB, q, s_rflux, s_uflux, s_vflux);
 
           //surfaceTerms(sk3,3,0     ,j);
-          surfaceTerms(e, es, sk3, 3, 0, j,
+          surfaceTerms(element, es, sk3, 3, 0, j,
                        sgeo, x, y, vmapM, vmapP, EToB, q, s_rflux, s_uflux, s_vflux);
         }
       }
     }
 
-    @barrier("local");
-
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
-        const dlong e = eo + es;
-        if(e<Nelements){
+        if(r_e<Nelements){
           #pragma unroll p_Nq
             for(int j=0;j<p_Nq;++j){
-              const dlong base = e*p_Np*p_Nfields+j*p_Nq+i;
+              const dlong base = element*p_Np*p_Nfields+j*p_Nq+i;
               rhsq[base+0*p_Np] += s_rflux[es][j][i];
               rhsq[base+1*p_Np] += s_uflux[es][j][i];
               rhsq[base+2*p_Np] += s_vflux[es][j][i];
diff --git a/solvers/acoustics/okl/acousticsSurfaceTet3D.okl b/solvers/acoustics/okl/acousticsSurfaceTet3D.okl
index 53fed8b36..bb2e20ed8 100644
--- a/solvers/acoustics/okl/acousticsSurfaceTet3D.okl
+++ b/solvers/acoustics/okl/acousticsSurfaceTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -54,6 +54,7 @@ void upwind(const dfloat nx,
 
 // batch process elements
 @kernel void acousticsSurfaceTet3D(const dlong Nelements,
+                                  @restrict const  dlong  *  elementIds,
                                   @restrict const  dfloat *  sgeo,
                                   @restrict const  dfloat *  LIFT,
                                   @restrict const  dlong  *  vmapM,
@@ -75,17 +76,21 @@ void upwind(const dfloat nx,
     @shared dfloat s_vflux[p_NblockS][p_NfacesNfp];
     @shared dfloat s_wflux[p_NblockS][p_NfacesNfp];
 
+    @exclusive dlong r_e, element; // r_e: position in elementIds (eo+es); element: id read from elementIds[r_e]; set in the first @inner loop, reused by the second
+
     // for all face nodes of all elements
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int n=0;n<p_maxNodes;++n;@inner(0)){ // maxNodes = max(Nfp*Nfaces,Np)
-        const dlong e = eo + es;
-        if(e<Nelements){
+        r_e = eo + es;
+        if(r_e<Nelements){
+          element = elementIds[r_e];
+
           if(n<p_NfacesNfp){
             // find face that owns this node
             const int face = n/p_Nfp;
 
             // load surface geofactors for this face
-            const dlong sid    = p_Nsgeo*(e*p_Nfaces+face);
+            const dlong sid    = p_Nsgeo*(element*p_Nfaces+face);
             const dfloat nx   = sgeo[sid+p_NXID];
             const dfloat ny   = sgeo[sid+p_NYID];
             const dfloat nz   = sgeo[sid+p_NZID];
@@ -93,12 +98,12 @@ void upwind(const dfloat nx,
             const dfloat invJ = sgeo[sid+p_IJID];
 
             // indices of negative and positive traces of face node
-            const dlong id  = e*p_Nfp*p_Nfaces + n;
+            const dlong id  = element*p_Nfp*p_Nfaces + n;
             const dlong idM = vmapM[id];
             const dlong idP = vmapP[id];
 
             // load traces
-            const dlong eM = e;
+            const dlong eM = element;
             const dlong eP = idP/p_Np;
             const int vidM = idM%p_Np;
             const int vidP = idP%p_Np;
@@ -106,18 +111,18 @@ void upwind(const dfloat nx,
             const dlong qbaseM = eM*p_Np*p_Nfields + vidM;
             const dlong qbaseP = eP*p_Np*p_Nfields + vidP;
 
-            const dfloat rM  = q[qbaseM + 0*p_Np];
+            const dfloat rM = q[qbaseM + 0*p_Np];
             const dfloat uM = q[qbaseM + 1*p_Np];
             const dfloat vM = q[qbaseM + 2*p_Np];
             const dfloat wM = q[qbaseM + 3*p_Np];
 
-            dfloat rP  = q[qbaseP + 0*p_Np];
+            dfloat rP = q[qbaseP + 0*p_Np];
             dfloat uP = q[qbaseP + 1*p_Np];
             dfloat vP = q[qbaseP + 2*p_Np];
             dfloat wP = q[qbaseP + 3*p_Np];
 
             // apply boundary condition
-            const int bc = EToB[face+p_Nfaces*e];
+            const int bc = EToB[face+p_Nfaces*element];
             if(bc>0){
               acousticsDirichletConditions3D(bc, time, x[idM], y[idM], z[idM], nx, ny, nz, rM, uM, vM, wM, &rP, &uP, &vP, &wP);
             }
@@ -129,7 +134,7 @@ void upwind(const dfloat nx,
 
             upwind(nx, ny, nz, rM, uM, vM, wM, rP, uP, vP, wP, &rflux, &uflux, &vflux, &wflux);
 
-            s_rflux[es][n]  = sc*(-rflux );
+            s_rflux[es][n] = sc*(-rflux );
             s_uflux[es][n] = sc*(-uflux);
             s_vflux[es][n] = sc*(-vflux);
             s_wflux[es][n] = sc*(-wflux);
@@ -138,14 +143,10 @@ void upwind(const dfloat nx,
       }
     }
 
-    // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
-
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
-        const dlong e = eo + es;
-        if(e<Nelements){
+        if(r_e<Nelements){
           if(n<p_Np){
             // load rhs data from volume fluxes
             dfloat Lrflux = 0.f, Luflux = 0.f, Lvflux = 0.f, Lwflux = 0.f;
@@ -160,7 +161,7 @@ void upwind(const dfloat nx,
                 Lwflux += L*s_wflux[es][m];
               }
 
-            const dlong base = e*p_Np*p_Nfields+n;
+            const dlong base = element*p_Np*p_Nfields+n;
             rhsq[base+0*p_Np] += Lrflux;
             rhsq[base+1*p_Np] += Luflux;
             rhsq[base+2*p_Np] += Lvflux;
diff --git a/solvers/acoustics/okl/acousticsSurfaceTri2D.okl b/solvers/acoustics/okl/acousticsSurfaceTri2D.okl
index 0929d1261..7e87ed75c 100644
--- a/solvers/acoustics/okl/acousticsSurfaceTri2D.okl
+++ b/solvers/acoustics/okl/acousticsSurfaceTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -48,6 +48,7 @@ void upwind(const dfloat nx,
 
 // batch process elements
 @kernel void acousticsSurfaceTri2D(const dlong Nelements,
+                                  @restrict const  dlong  *  elementIds,
                                   @restrict const  dfloat *  sgeo,
                                   @restrict const  dfloat *  LIFT,
                                   @restrict const  dlong  *  vmapM,
@@ -64,33 +65,37 @@ void upwind(const dfloat nx,
   for(dlong eo=0;eo<Nelements;eo+=p_NblockS;@outer(0)){
 
     // @shared storage for flux terms
-    @shared dfloat s_rflux [p_NblockS][p_NfacesNfp];
+    @shared dfloat s_rflux[p_NblockS][p_NfacesNfp];
     @shared dfloat s_uflux[p_NblockS][p_NfacesNfp];
     @shared dfloat s_vflux[p_NblockS][p_NfacesNfp];
 
+    @exclusive dlong r_e, element; // r_e: position in elementIds (eo+es); element: id read from elementIds[r_e]; set in the first @inner loop, reused by the second
+
     // for all face nodes of all elements
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int n=0;n<p_maxNodes;++n;@inner(0)){ // maxNodes = max(Nfp*Nfaces,Np)
-        const dlong e = eo + es;
-        if(e<Nelements){
+        r_e = eo + es;
+        if(r_e<Nelements){
+          element = elementIds[r_e];
+
           if(n<p_NfacesNfp){
             // find face that owns this node
             const int face = n/p_Nfp;
 
             // load surface geofactors for this face
-            const dlong sid   = p_Nsgeo*(e*p_Nfaces+face);
+            const dlong sid   = p_Nsgeo*(element*p_Nfaces+face);
             const dfloat nx   = sgeo[sid+p_NXID];
             const dfloat ny   = sgeo[sid+p_NYID];
             const dfloat sJ   = sgeo[sid+p_SJID];
             const dfloat invJ = sgeo[sid+p_IJID];
 
             // indices of negative and positive traces of face node
-            const dlong id  = e*p_Nfp*p_Nfaces + n;
+            const dlong id  = element*p_Nfp*p_Nfaces + n;
             const dlong idM = vmapM[id];
             const dlong idP = vmapP[id];
 
             // load traces
-            const dlong eM = e;
+            const dlong eM = element;
             const dlong eP = idP/p_Np;
             const int vidM = idM%p_Np;
             const int vidP = idP%p_Np;
@@ -107,7 +112,7 @@ void upwind(const dfloat nx,
             dfloat vP = q[qbaseP + 2*p_Np];
 
             // apply boundary condition
-            const int bc = EToB[face+p_Nfaces*e];
+            const int bc = EToB[face+p_Nfaces*element];
             if(bc>0){
               acousticsDirichletConditions2D(bc, time, x[idM], y[idM], nx, ny, rM, uM, vM, &rP, &uP, &vP);
               //should also add the Neumann BC here, but need uxM, uyM, vxM, abd vyM somehow
@@ -131,14 +136,10 @@ void upwind(const dfloat nx,
       }
     }
 
-    // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
-
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
-        const dlong e = eo + es;
-        if(e<Nelements){
+        if(r_e<Nelements){
           if(n<p_Np){
             // load rhs data from volume fluxes
             dfloat Lrflux = 0.f, Luflux = 0.f, Lvflux = 0.f;
@@ -152,7 +153,7 @@ void upwind(const dfloat nx,
                 Lvflux += L*s_vflux[es][m];
               }
 
-            const dlong base = e*p_Np*p_Nfields+n;
+            const dlong base = element*p_Np*p_Nfields+n;
             rhsq[base+0*p_Np] += Lrflux;
             rhsq[base+1*p_Np] += Luflux;
             rhsq[base+2*p_Np] += Lvflux;
diff --git a/solvers/acoustics/okl/acousticsVolumeHex3D.okl b/solvers/acoustics/okl/acousticsVolumeHex3D.okl
index 5165ff89e..00e52731c 100644
--- a/solvers/acoustics/okl/acousticsVolumeHex3D.okl
+++ b/solvers/acoustics/okl/acousticsVolumeHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -108,8 +108,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/acoustics/okl/acousticsVolumeQuad2D.okl b/solvers/acoustics/okl/acousticsVolumeQuad2D.okl
index fa453c1d4..9b293924b 100644
--- a/solvers/acoustics/okl/acousticsVolumeQuad2D.okl
+++ b/solvers/acoustics/okl/acousticsVolumeQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -81,8 +81,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
         const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
diff --git a/solvers/acoustics/okl/acousticsVolumeTet3D.okl b/solvers/acoustics/okl/acousticsVolumeTet3D.okl
index 0ab1901fa..efd9770dd 100644
--- a/solvers/acoustics/okl/acousticsVolumeTet3D.okl
+++ b/solvers/acoustics/okl/acousticsVolumeTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -103,8 +103,6 @@ SOFTWARE.
 
     }
 
-    @barrier("local");
-
     for(int n=0;n<p_Np;++n;@inner(0)){
 
       dfloat rhsq0 = 0, rhsq1 = 0, rhsq2 = 0, rhsq3 = 0;
@@ -155,8 +153,6 @@ SOFTWARE.
       s_w[n] = q[qbase+3*p_Np];
     }
 
-    @barrier("local");
-
     for(int n=0;n<p_Np;++n;@inner(0)){
 
       // prefetch geometric factors (constant on triangle)
@@ -246,8 +242,6 @@ SOFTWARE.
         }
     }
 
-    @barrier("local");
-
     for(int n=0;n<p_Np;++n;@inner(0)){
 
       dfloat r_drhodr[p_Nvol], r_drhods[p_Nvol], r_drhodt[p_Nvol];
@@ -372,8 +366,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     for(int et=0;et<p_NblockV;++et;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
 
diff --git a/solvers/acoustics/okl/acousticsVolumeTri2D.okl b/solvers/acoustics/okl/acousticsVolumeTri2D.okl
index bd6e0b227..55fb58a90 100644
--- a/solvers/acoustics/okl/acousticsVolumeTri2D.okl
+++ b/solvers/acoustics/okl/acousticsVolumeTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -72,8 +72,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     for(int n=0;n<p_Np;++n;@inner(0)){
 
       dfloat rhsq0 = 0, rhsq1 = 0, rhsq2 = 0;
diff --git a/solvers/acoustics/src/acousticsPlotFields.cpp b/solvers/acoustics/src/acousticsPlotFields.cpp
index c982294c5..0f1eaf72a 100644
--- a/solvers/acoustics/src/acousticsPlotFields.cpp
+++ b/solvers/acoustics/src/acousticsPlotFields.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,11 @@ SOFTWARE.
 #include "acoustics.hpp"
 
 // interpolate data to plot nodes and save to file (one per process
-void acoustics_t::PlotFields(dfloat* Q, char *fileName){
+void acoustics_t::PlotFields(memory<dfloat> Q, const std::string fileName){
 
   FILE *fp;
 
-  fp = fopen(fileName, "w");
+  fp = fopen(fileName.c_str(), "w");
 
   fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
   fprintf(fp, "  <UnstructuredGrid>\n");
@@ -44,33 +44,39 @@ void acoustics_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
 
   //scratch space for interpolation
-  size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat);
-  dfloat* scratch = (dfloat *) malloc(2*NscratchBytes);
+  size_t Nscratch = std::max(mesh.Np, mesh.plotNp);
+  memory<dfloat> scratch(2*Nscratch);
 
-  dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ix(mesh.plotNp);
+  memory<dfloat> Iy(mesh.plotNp);
+  memory<dfloat> Iz(mesh.plotNp);
 
   // compute plot node coordinates on the fly
   for(dlong e=0;e<mesh.Nelements;++e){
     mesh.PlotInterp(mesh.x + e*mesh.Np, Ix, scratch);
     mesh.PlotInterp(mesh.y + e*mesh.Np, Iy, scratch);
-    mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
+    if(mesh.dim==3) // z is only interpolated for 3D meshes; the dim==2 branch below writes 0.0 as the third coordinate
+      mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
 
-    for(int n=0;n<mesh.plotNp;++n){
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+    if (mesh.dim==2) {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],0.0);
+      }
+    } else {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+      }
     }
   }
   fprintf(fp, "        </DataArray>\n");
   fprintf(fp, "      </Points>\n");
 
-  free(Ix); free(Iy); free(Iz);
-
-  dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ip(mesh.plotNp);
+  memory<dfloat> Iu(mesh.plotNp);
+  memory<dfloat> Iv(mesh.plotNp);
+  memory<dfloat> Iw(mesh.plotNp);
 
   // write out density
   fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
@@ -105,8 +111,6 @@ void acoustics_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "       </DataArray>\n");
   fprintf(fp, "     </PointData>\n");
 
-  free(Ip); free(Iu); free(Iv); free(Iw);
-
   fprintf(fp, "    <Cells>\n");
   fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
 
@@ -147,6 +151,4 @@ void acoustics_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "  </UnstructuredGrid>\n");
   fprintf(fp, "</VTKFile>\n");
   fclose(fp);
-
-  free(scratch);
 }
diff --git a/solvers/acoustics/src/acousticsReport.cpp b/solvers/acoustics/src/acousticsReport.cpp
index b5d1876ba..cf7913a32 100644
--- a/solvers/acoustics/src/acousticsReport.cpp
+++ b/solvers/acoustics/src/acousticsReport.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -34,7 +34,7 @@ void acoustics_t::Report(dfloat time, int tstep){
   mesh.MassMatrixApply(o_q, o_Mq);
 
   dlong Nentries = mesh.Nelements*mesh.Np*Nfields;
-  dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+  dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
   if(mesh.rank==0)
     printf("%5.2f (%d), %5.2f (time, timestep, norm)\n", time, tstep, norm2);
@@ -45,11 +45,11 @@ void acoustics_t::Report(dfloat time, int tstep){
     o_q.copyTo(q);
 
     // output field files
-    string name;
+    std::string name;
     settings.getSetting("OUTPUT FILE NAME", name);
     char fname[BUFSIZ];
     sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++);
 
-    PlotFields(q, fname);
+    PlotFields(q, std::string(fname));
   }
 }
diff --git a/solvers/acoustics/src/acousticsRun.cpp b/solvers/acoustics/src/acousticsRun.cpp
index 188a7ef17..ee908a769 100644
--- a/solvers/acoustics/src/acousticsRun.cpp
+++ b/solvers/acoustics/src/acousticsRun.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -47,9 +47,9 @@ void acoustics_t::Run(){
   dfloat vmax = MaxWaveSpeed();
 
   dfloat dt = cfl*hmin/(vmax*(mesh.N+1.)*(mesh.N+1.));
-  timeStepper->SetTimeStep(dt);
+  timeStepper.SetTimeStep(dt);
 
-  timeStepper->Run(o_q, startTime, finalTime);
+  timeStepper.Run(*this, o_q, startTime, finalTime);
 
   // output norm of final solution
   {
@@ -57,7 +57,7 @@ void acoustics_t::Run(){
     mesh.MassMatrixApply(o_q, o_Mq);
 
     dlong Nentries = mesh.Nelements*mesh.Np*Nfields;
-    dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+    dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
     if(mesh.rank==0)
       printf("Solution norm = %17.15lg\n", norm2);
diff --git a/solvers/acoustics/src/acousticsSettings.cpp b/solvers/acoustics/src/acousticsSettings.cpp
index 172b2962d..9198e3261 100644
--- a/solvers/acoustics/src/acousticsSettings.cpp
+++ b/solvers/acoustics/src/acousticsSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ SOFTWARE.
 #include "acoustics.hpp"
 
 //settings for acoustics solver
-acousticsSettings_t::acousticsSettings_t(MPI_Comm& _comm):
+acousticsSettings_t::acousticsSettings_t(comm_t _comm):
   settings_t(_comm) {
 
   newSetting("DATA FILE",
@@ -66,10 +66,7 @@ acousticsSettings_t::acousticsSettings_t(MPI_Comm& _comm):
 
 void acousticsSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) {
     std::cout << "Acoustics Settings:\n\n";
     reportSetting("DATA FILE");
     reportSetting("TIME INTEGRATOR");
@@ -83,15 +80,15 @@ void acousticsSettings_t::report() {
 
 void acousticsSettings_t::parseFromFile(platformSettings_t& platformSettings,
                                   meshSettings_t& meshSettings,
-                                  const string filename) {
+                                  const std::string filename) {
   //read all settings from file
   settings_t s(comm);
   s.readSettingsFromFile(filename);
 
   for(auto it = s.settings.begin(); it != s.settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
-    const string val = set->getVal<string>();
+    setting_t& set = it->second;
+    const std::string name = set.getName();
+    const std::string val = set.getVal<std::string>();
     if (platformSettings.hasSetting(name))
       platformSettings.changeSetting(name, val);
     else if (meshSettings.hasSetting(name))
@@ -99,9 +96,7 @@ void acousticsSettings_t::parseFromFile(platformSettings_t& platformSettings,
     else if (hasSetting(name)) //self
       changeSetting(name, val);
     else  {
-      stringstream ss;
-      ss << "Unknown setting: [" << name << "] requested";
-      LIBP_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested");
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/acoustics/src/acousticsSetup.cpp b/solvers/acoustics/src/acousticsSetup.cpp
index 1de7585ec..4dea74a82 100644
--- a/solvers/acoustics/src/acousticsSetup.cpp
+++ b/solvers/acoustics/src/acousticsSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,120 +26,118 @@ SOFTWARE.
 
 #include "acoustics.hpp"
 
-acoustics_t& acoustics_t::Setup(platform_t& platform, mesh_t& mesh,
-                                acousticsSettings_t& settings){
+void acoustics_t::Setup(platform_t& _platform, mesh_t& _mesh,
+                        acousticsSettings_t& _settings){
 
-  acoustics_t* acoustics = new acoustics_t(platform, mesh, settings);
+  platform = _platform;
+  mesh = _mesh;
+  comm = _mesh.comm;
+  settings = _settings;
 
-  acoustics->Nfields = (mesh.dim==3) ? 4:3;
+  Nfields = (mesh.dim==3) ? 4:3;
 
-  dlong Nlocal = mesh.Nelements*mesh.Np*acoustics->Nfields;
-  dlong Nhalo  = mesh.totalHaloPairs*mesh.Np*acoustics->Nfields;
+  dlong Nlocal = mesh.Nelements*mesh.Np*Nfields;
+  dlong Nhalo  = mesh.totalHaloPairs*mesh.Np*Nfields;
+
+  //Trigger JIT kernel builds
+  ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add);
+
+  //setup linear algebra module
+  platform.linAlg().InitKernels({"innerProd"});
+
+  /*setup trace halo exchange */
+  traceHalo = mesh.HaloTraceSetup(Nfields);
 
   //setup timeStepper
   if (settings.compareSetting("TIME INTEGRATOR","AB3")){
-    acoustics->timeStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, acoustics->Nfields, *acoustics);
+    timeStepper.Setup<TimeStepper::ab3>(mesh.Nelements,
+                                        mesh.totalHaloPairs,
+                                        mesh.Np, Nfields, platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","LSERK4")){
-    acoustics->timeStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, acoustics->Nfields, *acoustics);
+    timeStepper.Setup<TimeStepper::lserk4>(mesh.Nelements,
+                                           mesh.totalHaloPairs,
+                                           mesh.Np, Nfields, platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","DOPRI5")){
-    acoustics->timeStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, acoustics->Nfields, *acoustics, mesh.comm);
+    timeStepper.Setup<TimeStepper::dopri5>(mesh.Nelements,
+                                           mesh.totalHaloPairs,
+                                           mesh.Np, Nfields, platform, comm); // NOTE(review): chain has no final else; any other "TIME INTEGRATOR" value skips timeStepper.Setup
   }
 
-  //setup linear algebra module
-  platform.linAlg.InitKernels({"innerProd"});
-
   // set penalty parameter
   dfloat Lambda2 = 0.5;
 
-  /*setup trace halo exchange */
-  acoustics->traceHalo = mesh.HaloTraceSetup(acoustics->Nfields);
-
   // compute samples of q at interpolation nodes
-  acoustics->q = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat));
-  acoustics->o_q = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), acoustics->q);
+  q.malloc(Nlocal+Nhalo);
+  o_q = platform.malloc<dfloat>(q);
 
   //storage for M*q during reporting
-  acoustics->o_Mq = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), acoustics->q);
-  mesh.MassMatrixKernelSetup(acoustics->Nfields); // mass matrix operator
+  o_Mq = platform.malloc<dfloat>(q);
+  mesh.MassMatrixKernelSetup(Nfields); // mass matrix operator
 
   // OCCA build stuff
-  occa::properties kernelInfo = mesh.props; //copy base occa properties
+  properties_t kernelInfo = mesh.props; //copy base occa properties
 
   //add boundary data to kernel info
-  string dataFileName;
+  std::string dataFileName;
   settings.getSetting("DATA FILE", dataFileName);
   kernelInfo["includes"] += dataFileName;
 
-
-  kernelInfo["defines/" "p_Nfields"]= acoustics->Nfields;
+  kernelInfo["defines/" "p_Nfields"]= Nfields;
 
   const dfloat p_half = 1./2.;
   kernelInfo["defines/" "p_half"]= p_half;
 
-  int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces));
+  int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces));
   kernelInfo["defines/" "p_maxNodes"]= maxNodes;
 
   int blockMax = 256;
   if (platform.device.mode() == "CUDA") blockMax = 512;
 
-  int NblockV = mymax(1, blockMax/mesh.Np);
+  int NblockV = std::max(1, blockMax/mesh.Np);
   kernelInfo["defines/" "p_NblockV"]= NblockV;
 
-  int NblockS = mymax(1, blockMax/maxNodes);
+  int NblockS = std::max(1, blockMax/maxNodes);
   kernelInfo["defines/" "p_NblockS"]= NblockS;
 
   kernelInfo["defines/" "p_Lambda2"]= Lambda2;
 
-  kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
-
   // set kernel name suffix
-  char *suffix;
-  if(mesh.elementType==TRIANGLES)
-    suffix = strdup("Tri2D");
-  if(mesh.elementType==QUADRILATERALS)
-    suffix = strdup("Quad2D");
-  if(mesh.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  if(mesh.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  std::string suffix;
+  if(mesh.elementType==Mesh::TRIANGLES)
+    suffix = "Tri2D";
+  if(mesh.elementType==Mesh::QUADRILATERALS)
+    suffix = "Quad2D";
+  if(mesh.elementType==Mesh::TETRAHEDRA)
+    suffix = "Tet3D";
+  if(mesh.elementType==Mesh::HEXAHEDRA)
+    suffix = "Hex3D";
+
+  std::string oklFilePrefix = DACOUSTICS "/okl/";
+  std::string oklFileSuffix = ".okl";
+
+  std::string fileName, kernelName;
 
   // kernels from volume file
-  sprintf(fileName, DACOUSTICS "/okl/acousticsVolume%s.okl", suffix);
-  sprintf(kernelName, "acousticsVolume%s", suffix);
+  fileName   = oklFilePrefix + "acousticsVolume" + suffix + oklFileSuffix;
+  kernelName = "acousticsVolume" + suffix;
 
-  acoustics->volumeKernel =  platform.buildKernel(fileName, kernelName,
+  volumeKernel =  platform.buildKernel(fileName, kernelName,
                                          kernelInfo);
   // kernels from surface file
-  sprintf(fileName, DACOUSTICS "/okl/acousticsSurface%s.okl", suffix);
-  sprintf(kernelName, "acousticsSurface%s", suffix);
+  fileName   = oklFilePrefix + "acousticsSurface" + suffix + oklFileSuffix;
+  kernelName = "acousticsSurface" + suffix;
 
-  acoustics->surfaceKernel = platform.buildKernel(fileName, kernelName,
+  surfaceKernel = platform.buildKernel(fileName, kernelName,
                                          kernelInfo);
 
   if (mesh.dim==2) {
-    sprintf(fileName, DACOUSTICS "/okl/acousticsInitialCondition2D.okl");
-    sprintf(kernelName, "acousticsInitialCondition2D");
+    fileName   = oklFilePrefix + "acousticsInitialCondition2D" + oklFileSuffix;
+    kernelName = "acousticsInitialCondition2D";
   } else {
-    sprintf(fileName, DACOUSTICS "/okl/acousticsInitialCondition3D.okl");
-    sprintf(kernelName, "acousticsInitialCondition3D");
+    fileName   = oklFilePrefix + "acousticsInitialCondition3D" + oklFileSuffix;
+    kernelName = "acousticsInitialCondition3D";
   }
 
-  acoustics->initialConditionKernel = platform.buildKernel(fileName, kernelName,
+  initialConditionKernel = platform.buildKernel(fileName, kernelName,
                                                   kernelInfo);
-
-  return *acoustics;
-}
-
-acoustics_t::~acoustics_t() {
-  volumeKernel.free();
-  surfaceKernel.free();
-  initialConditionKernel.free();
-
-  if (timeStepper) delete timeStepper;
-  if (traceHalo) traceHalo->Free();
 }
diff --git a/solvers/acoustics/src/acousticsStep.cpp b/solvers/acoustics/src/acousticsStep.cpp
index 5efb33271..a2d040a7e 100644
--- a/solvers/acoustics/src/acousticsStep.cpp
+++ b/solvers/acoustics/src/acousticsStep.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -33,10 +33,10 @@ dfloat acoustics_t::MaxWaveSpeed(){
 }
 
 //evaluate ODE rhs = f(q,t)
-void acoustics_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void acoustics_t::rhsf(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){  // o_Q: input field, o_RHS: output right-hand side, T: time passed to the surface kernel
 
   // extract q halo on DEVICE
-  traceHalo->ExchangeStart(o_Q, 1, ogs_dfloat);
+  traceHalo.ExchangeStart(o_Q, 1);  // begin the halo exchange of o_Q; it is completed by ExchangeFinish below
 
   volumeKernel(mesh.Nelements,
                mesh.o_vgeo,
@@ -44,18 +44,35 @@ void acoustics_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
                o_Q,
                o_RHS);
 
-  traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat);
-
-  surfaceKernel(mesh.Nelements,
-                mesh.o_sgeo,
-                mesh.o_LIFT,
-                mesh.o_vmapM,
-                mesh.o_vmapP,
-                mesh.o_EToB,
-                T,
-                mesh.o_x,
-                mesh.o_y,
-                mesh.o_z,
-                o_Q,
-                o_RHS);
+  if (mesh.NinternalElements)  // skip the launch when the list is empty; this pass is issued before ExchangeFinish
+    surfaceKernel(mesh.NinternalElements,
+                  mesh.o_internalElementIds,  // element list for this launch (presumably elements needing no halo data -- confirm in mesh setup)
+                  mesh.o_sgeo,
+                  mesh.o_LIFT,
+                  mesh.o_vmapM,
+                  mesh.o_vmapP,
+                  mesh.o_EToB,
+                  T,
+                  mesh.o_x,
+                  mesh.o_y,
+                  mesh.o_z,
+                  o_Q,
+                  o_RHS);
+
+  traceHalo.ExchangeFinish(o_Q, 1);  // complete the exchange started above before the second surface pass
+
+  if (mesh.NhaloElements)  // skip the launch when the list is empty; this pass runs only after ExchangeFinish
+    surfaceKernel(mesh.NhaloElements,
+                  mesh.o_haloElementIds,  // element list for this launch
+                  mesh.o_sgeo,
+                  mesh.o_LIFT,
+                  mesh.o_vmapM,
+                  mesh.o_vmapP,
+                  mesh.o_EToB,
+                  T,
+                  mesh.o_x,
+                  mesh.o_y,
+                  mesh.o_z,
+                  o_Q,
+                  o_RHS);
 }
diff --git a/solvers/advection/advection.hpp b/solvers/advection/advection.hpp
index 5ec62bbec..62d5ed434 100644
--- a/solvers/advection/advection.hpp
+++ b/solvers/advection/advection.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -36,53 +36,54 @@ SOFTWARE.
 
 #define DADVECTION LIBP_DIR"/solvers/advection/"
 
+using namespace libp;
+
 class advectionSettings_t: public settings_t {
 public:
-  advectionSettings_t(MPI_Comm& _comm);
+  advectionSettings_t(comm_t _comm);  // forwards _comm to settings_t and registers the advection settings with newSetting()
   void report();
   void parseFromFile(platformSettings_t& platformSettings,
                      meshSettings_t& meshSettings,
-                     const string filename);
+                     const std::string filename);  // each setting read from filename is routed to platform, mesh or this object; an unknown name aborts
 };
 
 class advection_t: public solver_t {
 public:
-  mesh_t &mesh;
-  TimeStepper::timeStepper_t* timeStepper;
+  mesh_t mesh;  // held by value; assigned from the caller's mesh in Setup()
+  timeStepper_t timeStepper;  // time integrator; Run() sets its step size and calls timeStepper.Run(*this, o_q, ...)
 
-  halo_t* traceHalo;
+  ogs::halo_t traceHalo;  // trace halo exchange, created in Setup() by mesh.HaloTraceSetup(1) (one field)
 
-  dfloat *q;
-  occa::memory o_q;
+  memory<dfloat> q;  // destination of o_q.copyTo(q) in Report(); this copy is what PlotFields() writes out
+  deviceMemory<dfloat> o_q;  // solution field advanced by the time stepper in Run()
 
-  occa::memory o_Mq;
+  deviceMemory<dfloat> o_Mq;  // result of mesh.MassMatrixApply(o_q, o_Mq); used for the reported solution norm
 
-  occa::kernel volumeKernel;
-  occa::kernel surfaceKernel;
+  kernel_t volumeKernel;
+  kernel_t surfaceKernel;
 
-  occa::kernel initialConditionKernel;
-  occa::kernel maxWaveSpeedKernel;
+  kernel_t initialConditionKernel;
+  kernel_t maxWaveSpeedKernel;
 
-  advection_t() = delete;
+  advection_t() = default;  // NOTE(review): a default-constructed solver presumably needs Setup() before Run() -- confirm
   advection_t(platform_t &_platform, mesh_t &_mesh,
-              advectionSettings_t& _settings):
-    solver_t(_platform, _settings), mesh(_mesh) {}
-
-  ~advection_t();
+              advectionSettings_t& _settings) {  // convenience constructor: construct and set up in one step
+    Setup(_platform, _mesh, _settings);
+  }
 
   //setup
-  static advection_t& Setup(platform_t& platform, mesh_t& mesh,
-                            advectionSettings_t& settings);
+  void Setup(platform_t& platform, mesh_t& mesh,  // now a member that initializes *this instead of returning a heap-allocated solver
+             advectionSettings_t& settings);
 
   void Run();
 
   void Report(dfloat time, int tstep);
 
-  void PlotFields(dfloat* Q, char *fileName);
+  void PlotFields(memory<dfloat> Q, const std::string fileName);  // writes a VTK UnstructuredGrid XML file to fileName
 
-  void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time);
+  void rhsf(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time);
 
-  dfloat MaxWaveSpeed(occa::memory& o_Q, const dfloat T);
+  dfloat MaxWaveSpeed(deviceMemory<dfloat>& o_Q, const dfloat T);  // Run() uses the result vmax in dt = cfl/(vmax*(N+1)^2)
 };
 
 #endif
diff --git a/solvers/advection/advectionMain.cpp b/solvers/advection/advectionMain.cpp
index e45076f2e..ba7c77ff6 100644
--- a/solvers/advection/advectionMain.cpp
+++ b/solvers/advection/advectionMain.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,38 +29,39 @@ SOFTWARE.
 int main(int argc, char **argv){
 
   // start up MPI
-  MPI_Init(&argc, &argv);
+  Comm::Init(argc, argv);
 
-  MPI_Comm comm = MPI_COMM_WORLD;
+  LIBP_ABORT("Usage: ./advectionMain setupfile", argc!=2);  // abort unless exactly one argument (the setup file) is given
 
-  if(argc!=2)
-    LIBP_ABORT(string("Usage: ./advectionMain setupfile"));
+  { /* Scope so that comm, the settings, platform, mesh and solver are all destructed before Comm::Finalize() */
+    comm_t comm(Comm::World().Dup());  // presumably a duplicate of the world communicator -- confirm in comm_t
 
-  //create default settings
-  platformSettings_t platformSettings(comm);
-  meshSettings_t meshSettings(comm);
-  advectionSettings_t advectionSettings(comm);
+    //create default settings
+    platformSettings_t platformSettings(comm);
+    meshSettings_t meshSettings(comm);
+    advectionSettings_t advectionSettings(comm);
 
-  //load settings from file
-  advectionSettings.parseFromFile(platformSettings, meshSettings, argv[1]);
+    //load settings from file (also fills the platform and mesh settings)
+    advectionSettings.parseFromFile(platformSettings, meshSettings, argv[1]);
 
-  // set up platform
-  platform_t platform(platformSettings);
+    // set up platform
+    platform_t platform(platformSettings);
 
-  platformSettings.report();
-  meshSettings.report();
-  advectionSettings.report();
+    platformSettings.report();
+    meshSettings.report();
+    advectionSettings.report();
 
-  // set up mesh
-  mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm);
+    // set up mesh (now a stack object, no longer a reference returned by a static Setup)
+    mesh_t mesh(platform, meshSettings, comm);
 
-  // set up advection solver
-  advection_t& advection = advection_t::Setup(platform, mesh, advectionSettings);
+    // set up advection solver (this constructor calls advection_t::Setup)
+    advection_t advection(platform, mesh, advectionSettings);
 
-  // run
-  advection.Run();
+    // run
+    advection.Run();
+  }
 
   // close down MPI
-  MPI_Finalize();
+  Comm::Finalize();
   return LIBP_SUCCESS;
 }
diff --git a/solvers/advection/data/advectionLinear2D.h b/solvers/advection/data/advectionLinear2D.h
index eab471a34..03be74d4c 100644
--- a/solvers/advection/data/advectionLinear2D.h
+++ b/solvers/advection/data/advectionLinear2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/advection/data/advectionLinear3D.h b/solvers/advection/data/advectionLinear3D.h
index 1dfeaa752..9234bc3dd 100644
--- a/solvers/advection/data/advectionLinear3D.h
+++ b/solvers/advection/data/advectionLinear3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/advection/makefile b/solvers/advection/makefile
index 34add97f0..497f3157c 100644
--- a/solvers/advection/makefile
+++ b/solvers/advection/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -78,11 +78,8 @@ include ../../make.top
 endif
 endif
 
-#gslib
-GS_DIR=${LIBP_TPL_DIR}/gslib
-
 #libraries
-ADVECTION_LIBP_LIBS=timeStepper mesh ogs linAlg core
+ADVECTION_LIBP_LIBS=timeStepper mesh parAdogs ogs linAlg core
 
 #includes
 INCLUDES=${LIBP_INCLUDES} \
@@ -93,11 +90,10 @@ DEFINES =${LIBP_DEFINES} \
          -DLIBP_DIR='"${LIBP_DIR}"'
 
 #.cpp compilation flags
-ADVECTION_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES}
+ADVECTION_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES}
 
 #link libraries
 LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(ADVECTION_LIBP_LIBS)) \
-     -L$(GS_DIR)/lib -lgs \
      ${LIBP_LIBS}
 
 #link flags
@@ -145,10 +141,10 @@ endif
 # rule for .cpp files
 %.o: %.cpp $(DEPS) | libp_libs
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $*.o -c $*.cpp $(ADVECTION_CXXFLAGS)
+	$(LIBP_CXX) -o $*.o -c $*.cpp $(ADVECTION_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $*.o -c $*.cpp $(ADVECTION_CXXFLAGS)
+	@$(LIBP_CXX) -o $*.o -c $*.cpp $(ADVECTION_CXXFLAGS)
 endif
 
 #cleanup
@@ -159,8 +155,7 @@ clean-libs: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} clean
 
 clean-kernels: clean-libs
-# 	$(shell ${OCCA_DIR}/bin/occa clear all -y)
-	rm -rf ~/.occa/
+	rm -rf ${LIBP_DIR}/.occa/
 
 realclean: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} realclean
diff --git a/solvers/advection/okl/advectionInitialCondition2D.okl b/solvers/advection/okl/advectionInitialCondition2D.okl
index bc73181a7..b4f171be6 100644
--- a/solvers/advection/okl/advectionInitialCondition2D.okl
+++ b/solvers/advection/okl/advectionInitialCondition2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/advection/okl/advectionInitialCondition3D.okl b/solvers/advection/okl/advectionInitialCondition3D.okl
index aeb36efcb..c7e6961b5 100644
--- a/solvers/advection/okl/advectionInitialCondition3D.okl
+++ b/solvers/advection/okl/advectionInitialCondition3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/advection/okl/advectionMaxWaveSpeedHex3D.okl b/solvers/advection/okl/advectionMaxWaveSpeedHex3D.okl
index b4334d389..4660d4beb 100644
--- a/solvers/advection/okl/advectionMaxWaveSpeedHex3D.okl
+++ b/solvers/advection/okl/advectionMaxWaveSpeedHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,7 +25,7 @@ SOFTWARE.
 */
 
 @kernel void advectionMaxWaveSpeedHex3D(const dlong Nelements,
-                                  @restrict const  dfloat *  vgeo,
+                                  @restrict const  dfloat *  wJ,
                                   @restrict const  dfloat *  sgeo,
                                   @restrict const  dlong  *  vmapM,
                                   @restrict const  int    *  EToB,
@@ -52,7 +52,7 @@ SOFTWARE.
       #pragma unroll p_Nq
       for(int k=0;k<p_Nq;++k){
         //sum jacobians to find element volume
-        s_J[n] += vgeo[p_Nvgeo*p_Np*e + k*p_Nq*p_Nq + n + p_Np*p_JWID];
+        s_J[n] += wJ[p_Np*e + k*p_Nq*p_Nq + n];
 
         //find max wavespeed
         const dlong id = e*p_Np+k*p_Nfp+n;
diff --git a/solvers/advection/okl/advectionMaxWaveSpeedQuad2D.okl b/solvers/advection/okl/advectionMaxWaveSpeedQuad2D.okl
index 65c99b56b..313982bda 100644
--- a/solvers/advection/okl/advectionMaxWaveSpeedQuad2D.okl
+++ b/solvers/advection/okl/advectionMaxWaveSpeedQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,7 +25,7 @@ SOFTWARE.
 */
 
 @kernel void advectionMaxWaveSpeedQuad2D(const dlong Nelements,
-                                  @restrict const  dfloat *  vgeo,
+                                  @restrict const  dfloat *  wJ,
                                   @restrict const  dfloat *  sgeo,
                                   @restrict const  dlong  *  vmapM,
                                   @restrict const  int    *  EToB,
@@ -52,7 +52,7 @@ SOFTWARE.
       #pragma unroll p_Nq
       for(int j=0;j<p_Nq;++j){
         //sum jacobians to find element volume
-        s_J[i] += vgeo[p_Nvgeo*p_Np*e + j*p_Nq+i + p_Np*p_JWID];
+        s_J[i] += wJ[p_Np*e + j*p_Nq+i];
 
         //find max wavespeed
         const dlong id = e*p_Np+j*p_Nq+i;
diff --git a/solvers/advection/okl/advectionMaxWaveSpeedTet3D.okl b/solvers/advection/okl/advectionMaxWaveSpeedTet3D.okl
index 82db4022d..a75bda374 100644
--- a/solvers/advection/okl/advectionMaxWaveSpeedTet3D.okl
+++ b/solvers/advection/okl/advectionMaxWaveSpeedTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,7 +25,7 @@ SOFTWARE.
 */
 
 @kernel void advectionMaxWaveSpeedTet3D(const dlong Nelements,
-                                  @restrict const  dfloat *  vgeo,
+                                  @restrict const  dfloat *  wJ,
                                   @restrict const  dfloat *  sgeo,
                                   @restrict const  dlong  *  vmapM,
                                   @restrict const  int    *  EToB,
diff --git a/solvers/advection/okl/advectionMaxWaveSpeedTri2D.okl b/solvers/advection/okl/advectionMaxWaveSpeedTri2D.okl
index bf0f34ce6..9250ba08f 100644
--- a/solvers/advection/okl/advectionMaxWaveSpeedTri2D.okl
+++ b/solvers/advection/okl/advectionMaxWaveSpeedTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,7 +25,7 @@ SOFTWARE.
 */
 
 @kernel void advectionMaxWaveSpeedTri2D(const dlong Nelements,
-                                  @restrict const  dfloat *  vgeo,
+                                  @restrict const  dfloat *  wJ,
                                   @restrict const  dfloat *  sgeo,
                                   @restrict const  dlong  *  vmapM,
                                   @restrict const  int    *  EToB,
diff --git a/solvers/advection/okl/advectionSurfaceHex3D.okl b/solvers/advection/okl/advectionSurfaceHex3D.okl
index f2c831c17..88297ce49 100644
--- a/solvers/advection/okl/advectionSurfaceHex3D.okl
+++ b/solvers/advection/okl/advectionSurfaceHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -117,7 +117,8 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("global");
+    /*Need barriers because surfaceTerms writes to global*/
+    @barrier();
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(2)){
@@ -138,7 +139,7 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("global");
+    @barrier();
 
     // face 2 & 4
     for(int es=0;es<p_NblockS;++es;@inner(2)){
diff --git a/solvers/advection/okl/advectionSurfaceQuad2D.okl b/solvers/advection/okl/advectionSurfaceQuad2D.okl
index 35e9d2656..e99180fc1 100644
--- a/solvers/advection/okl/advectionSurfaceQuad2D.okl
+++ b/solvers/advection/okl/advectionSurfaceQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -106,8 +106,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
-
     // for all face nodes of all elements
     // face 0 & 2
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -123,8 +121,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
-
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -139,8 +135,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
-
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/advection/okl/advectionSurfaceTet3D.okl b/solvers/advection/okl/advectionSurfaceTet3D.okl
index 6337f48af..e325eb11c 100644
--- a/solvers/advection/okl/advectionSurfaceTet3D.okl
+++ b/solvers/advection/okl/advectionSurfaceTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -101,9 +101,6 @@ SOFTWARE.
       }
     }
 
-    // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
-
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
diff --git a/solvers/advection/okl/advectionSurfaceTri2D.okl b/solvers/advection/okl/advectionSurfaceTri2D.okl
index 67b864f03..7256ffa35 100644
--- a/solvers/advection/okl/advectionSurfaceTri2D.okl
+++ b/solvers/advection/okl/advectionSurfaceTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -100,9 +100,6 @@ SOFTWARE.
       }
     }
 
-    // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
-
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
diff --git a/solvers/advection/okl/advectionVolumeHex3D.okl b/solvers/advection/okl/advectionVolumeHex3D.okl
index 6d342054a..dc920e760 100644
--- a/solvers/advection/okl/advectionVolumeHex3D.okl
+++ b/solvers/advection/okl/advectionVolumeHex3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -75,8 +75,6 @@
       }
     }
 
-    @barrier("local");
-
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/advection/okl/advectionVolumeQuad2D.okl b/solvers/advection/okl/advectionVolumeQuad2D.okl
index 52006cc92..00338f55b 100644
--- a/solvers/advection/okl/advectionVolumeQuad2D.okl
+++ b/solvers/advection/okl/advectionVolumeQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -64,8 +64,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
         const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
@@ -80,7 +78,7 @@ SOFTWARE.
           rhsqn += Djn*s_G[n][i];
         }
 
-        const dlong id = e*p_Np*p_Nfields + j*p_Nq + i;
+        const dlong id = e*p_Np + j*p_Nq + i;
 
         // move to rhs
         rhsq[id] = invJW*rhsqn;
diff --git a/solvers/advection/okl/advectionVolumeTet3D.okl b/solvers/advection/okl/advectionVolumeTet3D.okl
index 1c0fc38a7..60634dc1a 100644
--- a/solvers/advection/okl/advectionVolumeTet3D.okl
+++ b/solvers/advection/okl/advectionVolumeTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -66,8 +66,6 @@ for(dlong e=0;e<Nelements;++e;@outer(0)){
       s_H[n] = dtdx*cx + dtdy*cy + dtdz*cz;
     }
 
-    @barrier("local");
-
     for(int n=0;n<p_Np;++n;@inner(0)){
 
       dfloat rhsqn = 0;
diff --git a/solvers/advection/okl/advectionVolumeTri2D.okl b/solvers/advection/okl/advectionVolumeTri2D.okl
index 7ccd85b55..2673c732d 100644
--- a/solvers/advection/okl/advectionVolumeTri2D.okl
+++ b/solvers/advection/okl/advectionVolumeTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -57,8 +57,6 @@ SOFTWARE.
       s_G[n] = dsdx*cx + dsdy*cy;
     }
 
-    @barrier("local");
-
     for(int n=0;n<p_Np;++n;@inner(0)){
 
       dfloat rhsqn=0;
diff --git a/solvers/advection/src/advectionPlotFields.cpp b/solvers/advection/src/advectionPlotFields.cpp
index 0d30f4c77..e8fe26be2 100644
--- a/solvers/advection/src/advectionPlotFields.cpp
+++ b/solvers/advection/src/advectionPlotFields.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,11 @@ SOFTWARE.
 #include "advection.hpp"
 
 // interpolate data to plot nodes and save to file (one per process
-void advection_t::PlotFields(dfloat* Q, char *fileName){
+void advection_t::PlotFields(memory<dfloat> Q, const std::string fileName){
 
   FILE *fp;
 
-  fp = fopen(fileName, "w");
+  fp = fopen(fileName.c_str(), "w");
 
   fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
   fprintf(fp, "  <UnstructuredGrid>\n");
@@ -44,30 +44,36 @@ void advection_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
 
   //scratch space for interpolation
-  size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat);
-  dfloat* scratch = (dfloat *) malloc(2*NscratchBytes);
+  size_t Nscratch = std::max(mesh.Np, mesh.plotNp);
+  memory<dfloat> scratch(2*Nscratch);
 
-  dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ix(mesh.plotNp);
+  memory<dfloat> Iy(mesh.plotNp);
+  memory<dfloat> Iz(mesh.plotNp);
 
   // compute plot node coordinates on the fly
   for(dlong e=0;e<mesh.Nelements;++e){
     mesh.PlotInterp(mesh.x + e*mesh.Np, Ix, scratch);
     mesh.PlotInterp(mesh.y + e*mesh.Np, Iy, scratch);
-    mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
+    if(mesh.dim==3)
+      mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
 
-    for(int n=0;n<mesh.plotNp;++n){
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+    if (mesh.dim==2) {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],0.0);
+      }
+    } else {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+      }
     }
   }
   fprintf(fp, "        </DataArray>\n");
   fprintf(fp, "      </Points>\n");
 
-  free(Ix); free(Iy); free(Iz);
-
-  dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ip(mesh.plotNp);
 
   // write out field
   fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
@@ -83,8 +89,6 @@ void advection_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "       </DataArray>\n");
   fprintf(fp, "     </PointData>\n");
 
-  free(Ip);
-
   fprintf(fp, "    <Cells>\n");
   fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
 
@@ -125,6 +129,4 @@ void advection_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "  </UnstructuredGrid>\n");
   fprintf(fp, "</VTKFile>\n");
   fclose(fp);
-
-  free(scratch);
 }
diff --git a/solvers/advection/src/advectionReport.cpp b/solvers/advection/src/advectionReport.cpp
index 6036f4384..b8026aee0 100644
--- a/solvers/advection/src/advectionReport.cpp
+++ b/solvers/advection/src/advectionReport.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -34,7 +34,7 @@ void advection_t::Report(dfloat time, int tstep){
   mesh.MassMatrixApply(o_q, o_Mq);
 
   dlong Nentries = mesh.Nelements*mesh.Np;
-  dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+  dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
   if(mesh.rank==0)
     printf("%5.2f (%d), %5.2f (time, timestep, norm)\n", time, tstep, norm2);
@@ -45,11 +45,11 @@ void advection_t::Report(dfloat time, int tstep){
     o_q.copyTo(q);
 
     // output field files
-    string name;
+    std::string name;
     settings.getSetting("OUTPUT FILE NAME", name);
     char fname[BUFSIZ];
     sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++);
 
-    PlotFields(q, fname);
+    PlotFields(q, std::string(fname));
   }
 }
diff --git a/solvers/advection/src/advectionRun.cpp b/solvers/advection/src/advectionRun.cpp
index e0d33d35a..613fd1f53 100644
--- a/solvers/advection/src/advectionRun.cpp
+++ b/solvers/advection/src/advectionRun.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -46,9 +46,9 @@ void advection_t::Run(){
   dfloat vmax = MaxWaveSpeed(o_q, startTime);
 
   dfloat dt = cfl/(vmax*(mesh.N+1.)*(mesh.N+1.));
-  timeStepper->SetTimeStep(dt);
+  timeStepper.SetTimeStep(dt);
 
-  timeStepper->Run(o_q, startTime, finalTime);
+  timeStepper.Run(*this, o_q, startTime, finalTime);
 
   // output norm of final solution
   {
@@ -56,7 +56,7 @@ void advection_t::Run(){
     mesh.MassMatrixApply(o_q, o_Mq);
 
     dlong Nentries = mesh.Nelements*mesh.Np;
-    dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+    dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
     if(mesh.rank==0)
       printf("Solution norm = %17.15lg\n", norm2);
diff --git a/solvers/advection/src/advectionSettings.cpp b/solvers/advection/src/advectionSettings.cpp
index f34793954..d633235be 100644
--- a/solvers/advection/src/advectionSettings.cpp
+++ b/solvers/advection/src/advectionSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ SOFTWARE.
 #include "advection.hpp"
 
 //settings for advection solver
-advectionSettings_t::advectionSettings_t(MPI_Comm& _comm):
+advectionSettings_t::advectionSettings_t(comm_t _comm):
   settings_t(_comm) {
 
   newSetting("DATA FILE",
@@ -66,10 +66,7 @@ advectionSettings_t::advectionSettings_t(MPI_Comm& _comm):
 
 void advectionSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) {
     std::cout << "Advection Settings:\n\n";
     reportSetting("DATA FILE");
     reportSetting("TIME INTEGRATOR");
@@ -83,15 +80,15 @@ void advectionSettings_t::report() {
 
 void advectionSettings_t::parseFromFile(platformSettings_t& platformSettings,
                                   meshSettings_t& meshSettings,
-                                  const string filename) {
+                                  const std::string filename) {
   //read all settings from file
   settings_t s(comm);
   s.readSettingsFromFile(filename);
 
   for(auto it = s.settings.begin(); it != s.settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
-    const string val = set->getVal<string>();
+    setting_t& set = it->second;
+    const std::string name = set.getName();
+    const std::string val = set.getVal<std::string>();
     if (platformSettings.hasSetting(name))
       platformSettings.changeSetting(name, val);
     else if (meshSettings.hasSetting(name))
@@ -99,9 +96,7 @@ void advectionSettings_t::parseFromFile(platformSettings_t& platformSettings,
     else if (hasSetting(name)) //self
       changeSetting(name, val);
     else  {
-      stringstream ss;
-      ss << "Unknown setting: [" << name << "] requested";
-      LIBP_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested");
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/advection/src/advectionSetup.cpp b/solvers/advection/src/advectionSetup.cpp
index 918054239..43fb51337 100644
--- a/solvers/advection/src/advectionSetup.cpp
+++ b/solvers/advection/src/advectionSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,112 +26,109 @@ SOFTWARE.
 
 #include "advection.hpp"
 
-advection_t& advection_t::Setup(platform_t& platform, mesh_t& mesh,
-                                 advectionSettings_t& settings){
+void advection_t::Setup(platform_t& _platform, mesh_t& _mesh,
+                         advectionSettings_t& _settings){
 
-  advection_t* advection = new advection_t(platform, mesh, settings);
+  platform = _platform;
+  mesh = _mesh;
+  comm = mesh.comm;
+  settings = _settings;
 
   dlong Nlocal = mesh.Nelements*mesh.Np;
   dlong Nhalo  = mesh.totalHaloPairs*mesh.Np;
 
+  //Trigger JIT builds of the ogs kernels (dfloat data, add operation) during setup
+  ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add);
+
+  //setup linear algebra module
+  platform.linAlg().InitKernels({"innerProd", "max"});
+
+  /*setup trace halo exchange */
+  traceHalo = mesh.HaloTraceSetup(1); //one field
+
   //setup timeStepper
   if (settings.compareSetting("TIME INTEGRATOR","AB3")){
-    advection->timeStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, 1, *advection);
+    timeStepper.Setup<TimeStepper::ab3>(mesh.Nelements,
+                                        mesh.totalHaloPairs,
+                                        mesh.Np, 1, platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","LSERK4")){
-    advection->timeStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, 1, *advection);
+    timeStepper.Setup<TimeStepper::lserk4>(mesh.Nelements,
+                                           mesh.totalHaloPairs,
+                                           mesh.Np, 1, platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","DOPRI5")){
-    advection->timeStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, 1, *advection, mesh.comm);
+    timeStepper.Setup<TimeStepper::dopri5>(mesh.Nelements,
+                                           mesh.totalHaloPairs,
+                                           mesh.Np, 1, platform, comm);
   }
 
-  //setup linear algebra module
-  platform.linAlg.InitKernels({"innerProd", "max"});
-
-  /*setup trace halo exchange */
-  advection->traceHalo = mesh.HaloTraceSetup(1); //one field
-
   // compute samples of q at interpolation nodes
-  advection->q = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat));
-  advection->o_q = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), advection->q);
+  q.malloc(Nlocal+Nhalo);
+  o_q = platform.malloc<dfloat>(q);
 
   //storage for M*q during reporting
-  advection->o_Mq = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), advection->q);
+  o_Mq = platform.malloc<dfloat>(q);
   mesh.MassMatrixKernelSetup(1); // mass matrix operator
 
   // OCCA build stuff
-  occa::properties kernelInfo = mesh.props; //copy base occa properties
+  properties_t kernelInfo = mesh.props; //copy base occa properties
 
   //add boundary data to kernel info
-  string dataFileName;
+  std::string dataFileName;
   settings.getSetting("DATA FILE", dataFileName);
   kernelInfo["includes"] += dataFileName;
 
-  kernelInfo["defines/" "p_Nfields"]= 1;
-
-  int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces));
+  int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces));
   kernelInfo["defines/" "p_maxNodes"]= maxNodes;
 
   int blockMax = 256;
   if (platform.device.mode() == "CUDA") blockMax = 512;
 
-  int NblockV = mymax(1, blockMax/mesh.Np);
+  int NblockV = std::max(1, blockMax/mesh.Np);
   kernelInfo["defines/" "p_NblockV"]= NblockV;
 
-  int NblockS = mymax(1, blockMax/maxNodes);
+  int NblockS = std::max(1, blockMax/maxNodes);
   kernelInfo["defines/" "p_NblockS"]= NblockS;
 
-  kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
-
   // set kernel name suffix
-  char *suffix;
-  if(mesh.elementType==TRIANGLES)
-    suffix = strdup("Tri2D");
-  if(mesh.elementType==QUADRILATERALS)
-    suffix = strdup("Quad2D");
-  if(mesh.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  if(mesh.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  std::string suffix;
+  if(mesh.elementType==Mesh::TRIANGLES)
+    suffix = "Tri2D";
+  if(mesh.elementType==Mesh::QUADRILATERALS)
+    suffix = "Quad2D";
+  if(mesh.elementType==Mesh::TETRAHEDRA)
+    suffix = "Tet3D";
+  if(mesh.elementType==Mesh::HEXAHEDRA)
+    suffix = "Hex3D";
+
+  std::string oklFilePrefix = DADVECTION "/okl/";
+  std::string oklFileSuffix = ".okl";
+
+  std::string fileName, kernelName;
 
   // kernels from volume file
-  sprintf(fileName, DADVECTION "/okl/advectionVolume%s.okl", suffix);
-  sprintf(kernelName, "advectionVolume%s", suffix);
+  fileName   = oklFilePrefix + "advectionVolume" + suffix + oklFileSuffix;
+  kernelName = "advectionVolume" + suffix;
+
+  volumeKernel =  platform.buildKernel(fileName, kernelName, kernelInfo);
 
-  advection->volumeKernel =  platform.buildKernel(fileName, kernelName, kernelInfo);
   // kernels from surface file
-  sprintf(fileName, DADVECTION "/okl/advectionSurface%s.okl", suffix);
-  sprintf(kernelName, "advectionSurface%s", suffix);
+  fileName   = oklFilePrefix + "advectionSurface" + suffix + oklFileSuffix;
+  kernelName = "advectionSurface" + suffix;
 
-  advection->surfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
+  surfaceKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
 
   if (mesh.dim==2) {
-    sprintf(fileName, DADVECTION "/okl/advectionInitialCondition2D.okl");
-    sprintf(kernelName, "advectionInitialCondition2D");
+    fileName   = oklFilePrefix + "advectionInitialCondition2D" + oklFileSuffix;
+    kernelName = "advectionInitialCondition2D";
   } else {
-    sprintf(fileName, DADVECTION "/okl/advectionInitialCondition3D.okl");
-    sprintf(kernelName, "advectionInitialCondition3D");
+    fileName   = oklFilePrefix + "advectionInitialCondition3D" + oklFileSuffix;
+    kernelName = "advectionInitialCondition3D";
   }
 
-  advection->initialConditionKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
-
-  sprintf(fileName, DADVECTION "/okl/advectionMaxWaveSpeed%s.okl", suffix);
-  sprintf(kernelName, "advectionMaxWaveSpeed%s", suffix);
-
-  advection->maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
-
-  return *advection;
-}
+  initialConditionKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
 
-advection_t::~advection_t() {
-  volumeKernel.free();
-  surfaceKernel.free();
-  initialConditionKernel.free();
-  maxWaveSpeedKernel.free();
+  fileName   = oklFilePrefix + "advectionMaxWaveSpeed" + suffix + oklFileSuffix;
+  kernelName = "advectionMaxWaveSpeed" + suffix;
 
-  if (timeStepper) delete timeStepper;
-  if (traceHalo) traceHalo->Free();
+  maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
 }
diff --git a/solvers/advection/src/advectionStep.cpp b/solvers/advection/src/advectionStep.cpp
index 9ccbe7a11..99eb5f2a5 100644
--- a/solvers/advection/src/advectionStep.cpp
+++ b/solvers/advection/src/advectionStep.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,13 +26,13 @@ SOFTWARE.
 
 #include "advection.hpp"
 
-dfloat advection_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){
+dfloat advection_t::MaxWaveSpeed(deviceMemory<dfloat>& o_Q, const dfloat T){
 
   //Note: if this is on the critical path in the future, we should pre-allocate this
-  occa::memory o_maxSpeed = platform.malloc(mesh.Nelements*sizeof(dfloat));
+  deviceMemory<dfloat> o_maxSpeed = platform.malloc<dfloat>(mesh.Nelements);
 
   maxWaveSpeedKernel(mesh.Nelements,
-                     mesh.o_vgeo,
+                     mesh.o_wJ,
                      mesh.o_sgeo,
                      mesh.o_vmapM,
                      mesh.o_EToB,
@@ -43,17 +43,16 @@ dfloat advection_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){
                      o_Q,
                      o_maxSpeed);
 
-  const dfloat vmax = platform.linAlg.max(mesh.Nelements, o_maxSpeed, mesh.comm);
+  const dfloat vmax = platform.linAlg().max(mesh.Nelements, o_maxSpeed, mesh.comm);
 
-  o_maxSpeed.free();
   return vmax;
 }
 
 //evaluate ODE rhs = f(q,t)
-void advection_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void advection_t::rhsf(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){
 
   // extract q halo on DEVICE
-  traceHalo->ExchangeStart(o_Q, 1, ogs_dfloat);
+  traceHalo.ExchangeStart(o_Q, 1);
 
   volumeKernel(mesh.Nelements,
                mesh.o_vgeo,
@@ -65,7 +64,7 @@ void advection_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
                o_Q,
                o_RHS);
 
-  traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat);
+  traceHalo.ExchangeFinish(o_Q, 1);
 
   surfaceKernel(mesh.Nelements,
                 mesh.o_sgeo,
diff --git a/solvers/bns/bns.hpp b/solvers/bns/bns.hpp
index d47f3d8e0..125aec38f 100644
--- a/solvers/bns/bns.hpp
+++ b/solvers/bns/bns.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -36,33 +36,35 @@ SOFTWARE.
 
 #define DBNS LIBP_DIR"/solvers/bns/"
 
+using namespace libp;
+
 class bnsSettings_t: public settings_t {
 public:
-  bnsSettings_t(MPI_Comm& _comm);
+  bnsSettings_t(comm_t& _comm);
   void report();
   void parseFromFile(platformSettings_t& platformSettings,
                      meshSettings_t& meshSettings,
-                     const string filename);
+                     const std::string filename);
 };
 
 class bns_t: public solver_t {
 public:
-  mesh_t& mesh;
+  mesh_t mesh;
 
   int Nfields;
   int Npmlfields;
 
-  TimeStepper::timeStepper_t* timeStepper;
+  timeStepper_t timeStepper;
 
-  halo_t* traceHalo;
-  halo_t** multirateTraceHalo;
+  ogs::halo_t traceHalo;
+  memory<ogs::halo_t> multirateTraceHalo;
 
   dfloat RT, c, tauInv, Ma, Re, nu; // Flow parameters
 
   // Pml
   int pmlOrder;
   dfloat  sigmaXmax, sigmaYmax, sigmaZmax;
-  dfloat *pmlSigma;
+  memory<dfloat> pmlSigma;
   dfloat pmlAlpha;
 
   // Flag for using cubature integration for sigma terms in pml
@@ -71,38 +73,37 @@ class bns_t: public solver_t {
   // Flag for semi-analytic timestepping
   int semiAnalytic;
 
-  dfloat *q;
-  occa::memory o_q;
+  memory<dfloat> q;
+  deviceMemory<dfloat> o_q;
 
-  occa::memory o_Mq;
+  deviceMemory<dfloat> o_Mq;
 
-  dfloat *Vort, *VortMag;
-  occa::memory o_Vort, o_VortMag;
+  memory<dfloat> Vort, VortMag;
+  deviceMemory<dfloat> o_Vort, o_VortMag;
 
-  occa::memory o_pmlSigma;
+  deviceMemory<dfloat> o_pmlSigma;
 
-  occa::kernel volumeKernel;
-  occa::kernel surfaceKernel;
-  occa::kernel relaxationKernel;
+  kernel_t volumeKernel;
+  kernel_t surfaceKernel;
+  kernel_t relaxationKernel;
 
-  occa::kernel pmlVolumeKernel;
-  occa::kernel pmlSurfaceKernel;
-  occa::kernel pmlRelaxationKernel;
+  kernel_t pmlVolumeKernel;
+  kernel_t pmlSurfaceKernel;
+  kernel_t pmlRelaxationKernel;
 
-  occa::kernel vorticityKernel;
+  kernel_t vorticityKernel;
 
-  occa::kernel initialConditionKernel;
+  kernel_t initialConditionKernel;
 
-  bns_t() = delete;
+  bns_t() = default;
   bns_t(platform_t &_platform, mesh_t &_mesh,
-              bnsSettings_t& _settings):
-    solver_t(_platform, _settings), mesh(_mesh) {}
-
-  ~bns_t();
+              bnsSettings_t& _settings) {
+    Setup(_platform, _mesh, _settings);
+  }
 
   //setup
-  static bns_t& Setup(platform_t& platform, mesh_t& mesh,
-                      bnsSettings_t& settings);
+  void Setup(platform_t& _platform, mesh_t& _mesh,
+             bnsSettings_t& _settings);
 
   void PmlSetup();
 
@@ -110,40 +111,40 @@ class bns_t: public solver_t {
 
   void Report(dfloat time, int tstep);
 
-  void PlotFields(dfloat* Q, dfloat* V, char *fileName);
+  void PlotFields(memory<dfloat>& Q, memory<dfloat>& V, std::string fileName);
 
   dfloat MaxWaveSpeed();
 
-  void rhsf_pml(occa::memory& o_Q, occa::memory& o_pmlQ,
-                occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T);
+  void rhsf_pml(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS, const dfloat T);
 
-  void rhsf_MR_pml(occa::memory& o_Q, occa::memory& o_pmlQ,
-                   occa::memory& o_RHS, occa::memory& o_pmlRHS,
-                   occa::memory& o_fQM, const dfloat T, const int lev);
+  void rhsf_MR_pml(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                   deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS,
+                   deviceMemory<dfloat>& o_fQM, const dfloat T, const int lev);
 
   //seperate components of rhs evaluation
-  void rhsVolume(dlong N, occa::memory& o_ids,
-                 occa::memory& o_Q, occa::memory& o_RHS, const dfloat T);
-  void rhsPmlVolume(dlong N, occa::memory& o_ids, occa::memory& o_pmlids,
-                    occa::memory& o_Q, occa::memory& o_pmlQ,
-                    occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T);
-  void rhsRelaxation(dlong N, occa::memory& o_ids,
-                     occa::memory& o_Q, occa::memory& o_RHS);
-  void rhsPmlRelaxation(dlong N, occa::memory& o_ids, occa::memory& o_pmlids,
-                        occa::memory& o_Q, occa::memory& o_pmlQ,
-                        occa::memory& o_RHS, occa::memory& o_pmlRHS);
-  void rhsSurface(dlong N, occa::memory& o_ids,
-                  occa::memory& o_Q, occa::memory& o_RHS, const dfloat T);
-  void rhsPmlSurface(dlong N, occa::memory& o_ids, occa::memory& o_pmlids,
-                     occa::memory& o_Q, occa::memory& o_pmlQ,
-                     occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T);
-  void rhsSurfaceMR(dlong N, occa::memory& o_ids,
-                    occa::memory& o_Q, occa::memory& o_RHS,
-                    occa::memory& o_fQM, const dfloat T);
-  void rhsPmlSurfaceMR(dlong N, occa::memory& o_ids, occa::memory& o_pmlids,
-                       occa::memory& o_Q, occa::memory& o_pmlQ,
-                       occa::memory& o_RHS, occa::memory& o_pmlRHS,
-                       occa::memory& o_fQM, const dfloat T);
+  void rhsVolume(dlong N, deviceMemory<dlong>& o_ids,
+                 deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T);
+  void rhsPmlVolume(dlong N, deviceMemory<dlong>& o_ids, deviceMemory<dlong>& o_pmlids,
+                    deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                    deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS, const dfloat T);
+  void rhsRelaxation(dlong N, deviceMemory<dlong>& o_ids,
+                     deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS);
+  void rhsPmlRelaxation(dlong N, deviceMemory<dlong>& o_ids, deviceMemory<dlong>& o_pmlids,
+                        deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                        deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS);
+  void rhsSurface(dlong N, deviceMemory<dlong>& o_ids,
+                  deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T);
+  void rhsPmlSurface(dlong N, deviceMemory<dlong>& o_ids, deviceMemory<dlong>& o_pmlids,
+                     deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                     deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS, const dfloat T);
+  void rhsSurfaceMR(dlong N, deviceMemory<dlong>& o_ids,
+                    deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS,
+                    deviceMemory<dfloat>& o_fQM, const dfloat T);
+  void rhsPmlSurfaceMR(dlong N, deviceMemory<dlong>& o_ids, deviceMemory<dlong>& o_pmlids,
+                       deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                       deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS,
+                       deviceMemory<dfloat>& o_fQM, const dfloat T);
 };
 #endif
 
@@ -154,7 +155,7 @@ class bns_t: public solver_t {
   dfloat isoMinVal, isoMaxVal, *isoLevels, *isoq;
   size_t isoMax;
 
-  occa::memory o_isoLevels, o_isoq, o_isoNtris;
+  deviceMemory<dfloat> o_isoLevels, o_isoq, o_isoNtris;
 
   // MRSAAB Coefficients
   dfloat *MRSAAB_A, *MRSAAB_B, *MRSAAB_C, *MRAB_A, *MRAB_B, *MRAB_C;
@@ -164,7 +165,7 @@ class bns_t: public solver_t {
   int *isoGNlevels, isoGNgroups;
   dfloat **isoGLvalues;
 
-  occa::memory *o_isoGLvalues;
+  deviceMemory<dfloat> *o_isoGLvalues;
 
   // NBN: add storage for compacted isosurf data for gmsh write
   std::vector<dfloat> iso_nodes;
diff --git a/solvers/bns/bnsMain.cpp b/solvers/bns/bnsMain.cpp
index e2a16a0a7..e7e0e8c98 100644
--- a/solvers/bns/bnsMain.cpp
+++ b/solvers/bns/bnsMain.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,39 +29,40 @@ SOFTWARE.
 int main(int argc, char **argv){
 
   // start up MPI
-  MPI_Init(&argc, &argv);
+  Comm::Init(argc, argv);
 
-  MPI_Comm comm = MPI_COMM_WORLD;
+  LIBP_ABORT("Usage: ./bnsMain setupfile", argc!=2);
 
-  if(argc!=2)
-    LIBP_ABORT(string("Usage: ./bnsMain setupfile"));
+  { /*Scope so comm, settings, platform, mesh, and solver are destructed before Comm::Finalize */
+    comm_t comm(Comm::World().Dup());
 
-  //create default settings
-  platformSettings_t platformSettings(comm);
-  meshSettings_t meshSettings(comm);
-  bnsSettings_t bnsSettings(comm);
+    //create default settings
+    platformSettings_t platformSettings(comm);
+    meshSettings_t meshSettings(comm);
+    bnsSettings_t bnsSettings(comm);
 
-  //load settings from file
-  bnsSettings.parseFromFile(platformSettings, meshSettings,
-                            argv[1]);
+    //load settings from file
+    bnsSettings.parseFromFile(platformSettings, meshSettings,
+                              argv[1]);
 
-  // set up platform
-  platform_t platform(platformSettings);
+    // set up platform
+    platform_t platform(platformSettings);
 
-  platformSettings.report();
-  meshSettings.report();
-  bnsSettings.report();
+    platformSettings.report();
+    meshSettings.report();
+    bnsSettings.report();
 
-  // set up mesh
-  mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm);
+    // set up mesh
+    mesh_t mesh(platform, meshSettings, comm);
 
-  // set up bns solver
-  bns_t& bns = bns_t::Setup(platform, mesh, bnsSettings);
+    // set up bns solver
+    bns_t bns(platform, mesh, bnsSettings);
 
-  // run
-  bns.Run();
+    // run
+    bns.Run();
+  }
 
   // close down MPI
-  MPI_Finalize();
+  Comm::Finalize();
   return LIBP_SUCCESS;
 }
diff --git a/solvers/bns/data/bnsGaussian2D.h b/solvers/bns/data/bnsGaussian2D.h
index 70481a23d..0236e56d2 100644
--- a/solvers/bns/data/bnsGaussian2D.h
+++ b/solvers/bns/data/bnsGaussian2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/bns/data/bnsGaussian3D.h b/solvers/bns/data/bnsGaussian3D.h
index 845468795..3dd35cdca 100644
--- a/solvers/bns/data/bnsGaussian3D.h
+++ b/solvers/bns/data/bnsGaussian3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/bns/data/bnsUniform2D.h b/solvers/bns/data/bnsUniform2D.h
index 22e554f25..7e72719ce 100644
--- a/solvers/bns/data/bnsUniform2D.h
+++ b/solvers/bns/data/bnsUniform2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/bns/data/bnsUniform3D.h b/solvers/bns/data/bnsUniform3D.h
index 416edc37f..3c5d8fb17 100644
--- a/solvers/bns/data/bnsUniform3D.h
+++ b/solvers/bns/data/bnsUniform3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/bns/makefile b/solvers/bns/makefile
index d0758aa94..1d393db9e 100644
--- a/solvers/bns/makefile
+++ b/solvers/bns/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -77,11 +77,8 @@ include ../../make.top
 endif
 endif
 
-#gslib
-GS_DIR=${LIBP_TPL_DIR}/gslib
-
 #libraries
-BNS_LIBP_LIBS=timeStepper mesh ogs linAlg core
+BNS_LIBP_LIBS=timeStepper mesh parAdogs ogs linAlg core
 
 #includes
 INCLUDES=${LIBP_INCLUDES} \
@@ -91,11 +88,10 @@ DEFINES =${LIBP_DEFINES} \
          -DLIBP_DIR='"${LIBP_DIR}"'
 
 #.cpp compilation flags
-BNS_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES}
+BNS_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES}
 
 #link libraries
 LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(BNS_LIBP_LIBS)) \
-     -L$(GS_DIR)/lib -lgs \
      ${LIBP_LIBS}
 
 #link flags
@@ -143,10 +139,10 @@ endif
 # rule for .cpp files
 %.o: %.cpp $(DEPS) | libp_libs
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $*.o -c $*.cpp $(BNS_CXXFLAGS)
+	$(LIBP_CXX) -o $*.o -c $*.cpp $(BNS_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $*.o -c $*.cpp $(BNS_CXXFLAGS)
+	@$(LIBP_CXX) -o $*.o -c $*.cpp $(BNS_CXXFLAGS)
 endif
 
 #cleanup
@@ -157,8 +153,7 @@ clean-libs: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} clean
 
 clean-kernels: clean-libs
-# 	$(shell ${OCCA_DIR}/bin/occa clear all -y)
-	rm -rf ~/.occa/
+	rm -rf ${LIBP_DIR}/.occa/
 
 realclean: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} realclean
diff --git a/solvers/bns/okl/bnsConstrainQuad3D.okl b/solvers/bns/okl/bnsConstrainQuad3D.okl
index 2f46eb8e4..69d26598b 100644
--- a/solvers/bns/okl/bnsConstrainQuad3D.okl
+++ b/solvers/bns/okl/bnsConstrainQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,34 +26,34 @@ SOFTWARE.
 
 
 @kernel void bnsConstrainQuad3D(const dlong Nelements,
-				@restrict const  dfloat *  x,
-				@restrict const  dfloat *  y,
-				@restrict const  dfloat *  z,
-				@restrict dfloat *  rhsq){
+                                @restrict const  dfloat *  x,
+                                @restrict const  dfloat *  y,
+                                @restrict const  dfloat *  z,
+                                @restrict dfloat *  rhsq){
 
   for(dlong e=0;e<Nelements;++e;@outer(0)){
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
-	
+
         const dlong  base = e*p_Np*p_Nfields + j*p_Nq + i;
 
-	const dfloat xij = x[i+j*p_Nq+e*p_Np];
-	const dfloat yij = y[i+j*p_Nq+e*p_Np];
-	const dfloat zij = z[i+j*p_Nq+e*p_Np];
-
-	dfloat rhsq1 = rhsq[base+1*p_Np];
-	dfloat rhsq2 = rhsq[base+2*p_Np];
-	dfloat rhsq3 = rhsq[base+3*p_Np];
-	
-	const dfloat xdotrhsq = (rhsq1*xij + rhsq2*yij + rhsq3*zij)*p_invRadiusSq;
-	rhsq1 -= xij*xdotrhsq;
-	rhsq2 -= yij*xdotrhsq;
-	rhsq3 -= zij*xdotrhsq;
-	
-	rhsq[base+1*p_Np] = rhsq1;
-	rhsq[base+2*p_Np] = rhsq2;
-	rhsq[base+3*p_Np] = rhsq3;
+        const dfloat xij = x[i+j*p_Nq+e*p_Np];
+        const dfloat yij = y[i+j*p_Nq+e*p_Np];
+        const dfloat zij = z[i+j*p_Nq+e*p_Np];
+
+        dfloat rhsq1 = rhsq[base+1*p_Np];
+        dfloat rhsq2 = rhsq[base+2*p_Np];
+        dfloat rhsq3 = rhsq[base+3*p_Np];
+
+        const dfloat xdotrhsq = (rhsq1*xij + rhsq2*yij + rhsq3*zij)*p_invRadiusSq;
+        rhsq1 -= xij*xdotrhsq;
+        rhsq2 -= yij*xdotrhsq;
+        rhsq3 -= zij*xdotrhsq;
+
+        rhsq[base+1*p_Np] = rhsq1;
+        rhsq[base+2*p_Np] = rhsq2;
+        rhsq[base+3*p_Np] = rhsq3;
       }
     }
   }
diff --git a/solvers/bns/okl/bnsInitialCondition2D.okl b/solvers/bns/okl/bnsInitialCondition2D.okl
index 7cfc61c71..14b7c5977 100644
--- a/solvers/bns/okl/bnsInitialCondition2D.okl
+++ b/solvers/bns/okl/bnsInitialCondition2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/bns/okl/bnsInitialCondition3D.okl b/solvers/bns/okl/bnsInitialCondition3D.okl
index 864ee6443..952adefbc 100644
--- a/solvers/bns/okl/bnsInitialCondition3D.okl
+++ b/solvers/bns/okl/bnsInitialCondition3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/bns/okl/bnsIsoSurface3D.okl b/solvers/bns/okl/bnsIsoSurface3D.okl
index a6ef1c2e3..de23063ce 100644
--- a/solvers/bns/okl/bnsIsoSurface3D.okl
+++ b/solvers/bns/okl/bnsIsoSurface3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -304,7 +304,6 @@ int marchingTet(const int fld,
       }
     }
 
-    @barrier("local");
     
     for(int n=0;n<p_plotNthreads;++n;@inner(0)){
       if(n<p_plotNp){
@@ -332,7 +331,6 @@ int marchingTet(const int fld,
     }
     
     
-    @barrier("local");
 
     for(int n=0;n<p_plotNthreads;++n;@inner(0)){
 
diff --git a/solvers/bns/okl/bnsRelaxationHex3D.okl b/solvers/bns/okl/bnsRelaxationHex3D.okl
index 198826853..a59cfa873 100644
--- a/solvers/bns/okl/bnsRelaxationHex3D.okl
+++ b/solvers/bns/okl/bnsRelaxationHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -74,7 +74,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -98,7 +97,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -113,7 +111,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
 
     //interpolate in j, store in register
@@ -138,7 +135,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -153,7 +149,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in k, store in register
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -175,7 +170,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //construct nonlinear term from registers
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -206,7 +200,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project in k
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -229,7 +222,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -244,7 +236,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project in j
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -267,7 +258,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -282,7 +272,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // project in i and update
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -379,7 +368,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for (int d=0;d<4;d++) { //loop over fields q, qx, qy, and qz
 
@@ -397,7 +385,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       //interpolate in i, store in register
       for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -423,7 +410,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       //write register back to @shared
       for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -438,7 +424,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       //interpolate in j, store in register
       for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -463,7 +448,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       //write register back to @shared
       for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -478,7 +462,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       //interpolate in k, store in register
       for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -501,7 +484,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     //construct nonlinear term from registers
@@ -551,7 +533,6 @@ SOFTWARE.
     }
 
 
-    @barrier("local");
 
     for (int d=0;d<4;d++) { //loop over fields rhsq, rhsqx, rhsqy, and rhsqz
 
@@ -589,7 +570,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       //write register back to @shared
       for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -626,7 +606,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       //write register back to @shared
       for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -641,7 +620,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // project in i
       for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -667,7 +645,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     // update
@@ -763,7 +740,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -794,7 +770,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -812,7 +787,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
 
     //interpolate in j, store in register
@@ -844,7 +818,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -862,7 +835,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in k, store in register
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -891,7 +863,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //construct nonlinear term from registers
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -940,7 +911,6 @@ SOFTWARE.
     }
 
 
-    @barrier("local");
 
     //project in k
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -970,7 +940,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -1016,7 +985,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int k=0;k<p_cubNq;++k;@inner(2)){
@@ -1034,7 +1002,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // project in i and update
     for(int k=0;k<p_cubNq;++k;@inner(2)){
diff --git a/solvers/bns/okl/bnsRelaxationQuad2D.okl b/solvers/bns/okl/bnsRelaxationQuad2D.okl
index e198661b4..681f6cb16 100644
--- a/solvers/bns/okl/bnsRelaxationQuad2D.okl
+++ b/solvers/bns/okl/bnsRelaxationQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -77,7 +77,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int es=0;es<p_NblockCub;++es;@inner(2)){
@@ -101,7 +100,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int es=0;es<p_NblockCub;++es;@inner(2)){
@@ -116,7 +114,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
 
     //interpolate in j, store in register
@@ -139,7 +136,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //construct nonlinear term from registers
     for(int es=0;es<p_NblockCub;++es;@inner(2)){
@@ -167,7 +163,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project in j
     for(int es=0;es<p_NblockCub;++es;@inner(2)){
@@ -190,7 +185,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int es=0;es<p_NblockCub;++es;@inner(2)){
@@ -205,7 +199,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // project in i and update
     for(int es=0;es<p_NblockCub;++es;@inner(2)){
@@ -303,7 +296,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -330,7 +322,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -345,7 +336,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
      //interpolate in j, store in register
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -369,7 +359,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //construct nonlinear term from registers
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -407,7 +396,6 @@ SOFTWARE.
     }
 
 
-    @barrier("local");
 
     //project in j
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -431,7 +419,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -446,7 +433,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // project in i and update
     for(int j=0;j<p_cubNq;++j;@inner(1)){
diff --git a/solvers/bns/okl/bnsRelaxationQuad3D.okl b/solvers/bns/okl/bnsRelaxationQuad3D.okl
index d8ee5bdab..f5656d65a 100644
--- a/solvers/bns/okl/bnsRelaxationQuad3D.okl
+++ b/solvers/bns/okl/bnsRelaxationQuad3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -31,109 +31,109 @@
 
 // nodal version
 @kernel void bnsRelaxationQuad3D(const dlong Nelements,
-				 @restrict const  dlong *  elementIds,
-				 @restrict const  dfloat *  vgeo,
-				 @restrict const  dfloat *  cubvgeo,
-				 const dlong offset,
-				 const int   shift,
-				 @restrict const  dfloat *  cubInterpT,
-				 @restrict const  dfloat *  cubProjectT,
-				 @restrict const  dfloat *  q,
-				 @restrict dfloat *  rhsq){
-  
+                                 @restrict const  dlong *  elementIds,
+                                 @restrict const  dfloat *  vgeo,
+                                 @restrict const  dfloat *  cubvgeo,
+                                 const dlong offset,
+                                 const int   shift,
+                                 @restrict const  dfloat *  cubInterpT,
+                                 @restrict const  dfloat *  cubProjectT,
+                                 @restrict const  dfloat *  q,
+                                 @restrict dfloat *  rhsq){
+
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
-    
+
     // @shared memory for q at nodes
     @exclusive dfloat r_q[p_Nfields];
-    @exclusive dlong e; 
-    
+    @exclusive dlong e;
+
     //fetch the U and V values and store in flux arrays in @shared
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_Nq;++j;@inner(1)){ 
-        for(int i=0;i<p_Nq;++i;@inner(0)){    
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_Nq;++j;@inner(1)){
+        for(int i=0;i<p_Nq;++i;@inner(0)){
           dlong et = eo+es; // element in block
           if (et<Nelements) {
             e = elementIds[et];
 
-	    const dlong id = e*p_Nfields*p_Np + j*p_Nq + i;
-	    
+            const dlong id = e*p_Nfields*p_Np + j*p_Nq + i;
+
 #pragma unroll p_Nfields
-	    for(int fld=0; fld<p_Nfields;++fld){
-	      r_q[fld] = q[id+fld*p_Np];
-	    }
-	    
+            for(int fld=0; fld<p_Nfields;++fld){
+              r_q[fld] = q[id+fld*p_Np];
+            }
+
             dlong rhsId = e*p_Np*p_Nfields + j*p_Nq + i;
-	    // 
-	    if(p_MRSAAB)
-	      rhsId     += shift*offset;
-	    
-	    // BGK relaxation approximation to the Boltzmann collision operator for N5 - N10
-	    // [ for LSERK the linear term is added below ]
-	    dfloat rinv = 1.f/r_q[0];
-	    dfloat tmp4 = (p_invsqrt2*r_q[1]*r_q[1]*rinv);
-	    dfloat tmp5 = (p_invsqrt2*r_q[2]*r_q[2]*rinv);
-	    dfloat tmp6 = (p_invsqrt2*r_q[3]*r_q[3]*rinv);
-	    dfloat tmp7 = r_q[1]*r_q[2]*rinv;
-	    dfloat tmp8 = r_q[1]*r_q[3]*rinv;
-	    dfloat tmp9 = r_q[2]*r_q[3]*rinv;
-	    
-	    if(!p_SEMI_ANALYTIC){
-	      tmp4 -= r_q[4];
-	      tmp5 -= r_q[5];
-	      tmp6 -= r_q[6];
-	      tmp7 -= r_q[7];
-	      tmp8 -= r_q[8];
-	      tmp9 -= r_q[9];
-	    }
-
-	    rhsq[rhsId + 4*p_Np] += p_tauInv*tmp4;
-	    rhsq[rhsId + 5*p_Np] += p_tauInv*tmp5;
-	    rhsq[rhsId + 6*p_Np] += p_tauInv*tmp6;
-	    rhsq[rhsId + 7*p_Np] += p_tauInv*tmp7;
-	    rhsq[rhsId + 8*p_Np] += p_tauInv*tmp8;
-	    rhsq[rhsId + 9*p_Np] += p_tauInv*tmp9;	    
-	  }
-	}
+            //
+            if(p_MRSAAB)
+              rhsId     += shift*offset;
+
+            // BGK relaxation approximation to the Boltzmann collision operator for N5 - N10
+            // [ for LSERK the linear term is added below ]
+            dfloat rinv = 1.f/r_q[0];
+            dfloat tmp4 = (p_invsqrt2*r_q[1]*r_q[1]*rinv);
+            dfloat tmp5 = (p_invsqrt2*r_q[2]*r_q[2]*rinv);
+            dfloat tmp6 = (p_invsqrt2*r_q[3]*r_q[3]*rinv);
+            dfloat tmp7 = r_q[1]*r_q[2]*rinv;
+            dfloat tmp8 = r_q[1]*r_q[3]*rinv;
+            dfloat tmp9 = r_q[2]*r_q[3]*rinv;
+
+            if(!p_SEMI_ANALYTIC){
+              tmp4 -= r_q[4];
+              tmp5 -= r_q[5];
+              tmp6 -= r_q[6];
+              tmp7 -= r_q[7];
+              tmp8 -= r_q[8];
+              tmp9 -= r_q[9];
+            }
+
+            rhsq[rhsId + 4*p_Np] += p_tauInv*tmp4;
+            rhsq[rhsId + 5*p_Np] += p_tauInv*tmp5;
+            rhsq[rhsId + 6*p_Np] += p_tauInv*tmp6;
+            rhsq[rhsId + 7*p_Np] += p_tauInv*tmp7;
+            rhsq[rhsId + 8*p_Np] += p_tauInv*tmp8;
+            rhsq[rhsId + 9*p_Np] += p_tauInv*tmp9;
+          }
+        }
       }
     }
   }
-} 
+}
 
 #else
-// cubature version 
+// cubature version
 @kernel void bnsRelaxationQuad3D(const dlong Nelements,
-				 @restrict const  dlong *  elementIds,
-				 @restrict const  dfloat *  vgeo,
-				 @restrict const  dfloat *  cubvgeo,
-				 const dlong offset,
-				 const int   shift,
-				 @restrict const  dfloat *  cubInterpT,
-				 @restrict const  dfloat *  cubProjectT,
-				 @restrict const  dfloat *  q,
-				 @restrict dfloat *  rhsq){
-  
+                                 @restrict const  dlong *  elementIds,
+                                 @restrict const  dfloat *  vgeo,
+                                 @restrict const  dfloat *  cubvgeo,
+                                 const dlong offset,
+                                 const int   shift,
+                                 @restrict const  dfloat *  cubInterpT,
+                                 @restrict const  dfloat *  cubProjectT,
+                                 @restrict const  dfloat *  q,
+                                 @restrict dfloat *  rhsq){
+
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
 
     // @shared memory for q at nodes
     @shared dfloat s_q[p_NblockV][p_Nfields][p_cubNq][p_cubNq];
 
     @exclusive dfloat r_q[p_Nfields];
-    
+
     @shared dfloat s_cubInterpT[p_Nq][p_cubNq];
     @shared dfloat s_cubProjectT[p_cubNq][p_Nq];
-    
-    @exclusive dlong e; 
-    
+
+    @exclusive dlong e;
+
     //fetch the U and V values and store in flux arrays in @shared
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_cubNq;++j;@inner(1)){ 
-        for(int i=0;i<p_cubNq;++i;@inner(0)){    
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_cubNq;++j;@inner(1)){
+        for(int i=0;i<p_cubNq;++i;@inner(0)){
           dlong et = eo+es; // element in block
           if (et<Nelements) {
             e = elementIds[et];
-            if ((i<p_Nq) && (j<p_Nq)){ 
+            if ((i<p_Nq) && (j<p_Nq)){
               const dlong id = e*p_Nfields*p_Np + j*p_Nq + i;
-              
+
 #pragma unroll p_Nfields
               for(int fld=0; fld<p_Nfields;++fld){
                 s_q[es][fld][j][i] = q[id+fld*p_Np];
@@ -149,25 +149,24 @@
         }
       }
     }
-      
-    @barrier("local");
+
 
     //interpolate in i, store in register
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_cubNq;++j;@inner(1)){ 
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
           if (j<p_Nq) {
 
             for(int fld=0; fld<p_Nfields; fld++)
-              r_q[fld] = 0.f; 
-            
+              r_q[fld] = 0.f;
+
 
 #pragma unroll p_Nq
             for (int n=0;n<p_Nq;n++) {
               const dfloat Ini = s_cubInterpT[n][i];
 #pragma unroll p_Nfields
               for(int fld=0; fld<p_Nfields; fld++){
-                r_q[fld]  += Ini*s_q[es][fld][j][n];                
+                r_q[fld]  += Ini*s_q[es][fld][j][n];
               }
             }
           }
@@ -175,88 +174,84 @@
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_cubNq;++j;@inner(1)){ 
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
           if (j<p_Nq) {
             for(int fld=0; fld<p_Nfields; fld++){
-              s_q[es][fld][j][i] = r_q[fld];              
+              s_q[es][fld][j][i] = r_q[fld];
             }
           }
         }
       }
     }
 
-    @barrier("local");
 
     //interpolate in j, store in register
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_cubNq;++j;@inner(1)){ 
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
-         
+
           for(int fld=0; fld<p_Nfields; fld++)
-            r_q[fld] = 0.f; 
-  
+            r_q[fld] = 0.f;
+
 #pragma unroll p_Nq
           for (int n=0;n<p_Nq;n++) {
             const dfloat Inj = s_cubInterpT[n][j];
             // #pragma unroll p_Nfields
             for(int fld=0; fld<p_Nfields; fld++){
-              r_q[fld]  += Inj*s_q[es][fld][n][i];                
+              r_q[fld]  += Inj*s_q[es][fld][n][i];
             }
-          }   
+          }
         }
       }
     }
 
-    @barrier("local");
-  
+
     //interpolate in j, store in register
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_cubNq;++j;@inner(1)){ 
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
-	  dlong et = eo+es; // element in block
-	  if (et<Nelements) {
-	    const dlong gid = e*p_cubNp*p_Nvgeo+ j*p_cubNq +i;
-	    const dfloat J = cubvgeo[gid + p_JID*p_cubNp];
-	    
-	    const dfloat icubq1 = 1.f/r_q[0];
-	    // BGK relaxation approximation to the Boltzmann collision operator for N5 - N10
-	    // [ for LSERK the linear term is added below ]
-	    s_q[es][4][j][i]  = J*p_tauInv*(p_invsqrt2*r_q[1]*r_q[1]*icubq1);
-	    s_q[es][5][j][i]  = J*p_tauInv*(p_invsqrt2*r_q[2]*r_q[2]*icubq1);
-	    s_q[es][6][j][i]  = J*p_tauInv*(p_invsqrt2*r_q[3]*r_q[3]*icubq1);
-
-	    s_q[es][7][j][i]  = J*p_tauInv*(r_q[1]*r_q[2]*icubq1);
-	    s_q[es][8][j][i]  = J*p_tauInv*(r_q[1]*r_q[3]*icubq1);
-	    s_q[es][9][j][i]  = J*p_tauInv*(r_q[2]*r_q[3]*icubq1);
-	    
-	    if(!p_SEMI_ANALYTIC){
-	      s_q[es][4][j][i] -= J*p_tauInv*r_q[4];
-	      s_q[es][5][j][i] -= J*p_tauInv*r_q[5];
-	      s_q[es][6][j][i] -= J*p_tauInv*r_q[6];
-	      s_q[es][7][j][i] -= J*p_tauInv*r_q[7];
-	      s_q[es][8][j][i] -= J*p_tauInv*r_q[8];
-	      s_q[es][9][j][i] -= J*p_tauInv*r_q[9];
-	    }
-	  }
-	}
+          dlong et = eo+es; // element in block
+          if (et<Nelements) {
+            const dlong gid = e*p_cubNp*p_Nvgeo+ j*p_cubNq +i;
+            const dfloat J = cubvgeo[gid + p_JID*p_cubNp];
+
+            const dfloat icubq1 = 1.f/r_q[0];
+            // BGK relaxation approximation to the Boltzmann collision operator for N5 - N10
+            // [ for LSERK the linear term is added below ]
+            s_q[es][4][j][i]  = J*p_tauInv*(p_invsqrt2*r_q[1]*r_q[1]*icubq1);
+            s_q[es][5][j][i]  = J*p_tauInv*(p_invsqrt2*r_q[2]*r_q[2]*icubq1);
+            s_q[es][6][j][i]  = J*p_tauInv*(p_invsqrt2*r_q[3]*r_q[3]*icubq1);
+
+            s_q[es][7][j][i]  = J*p_tauInv*(r_q[1]*r_q[2]*icubq1);
+            s_q[es][8][j][i]  = J*p_tauInv*(r_q[1]*r_q[3]*icubq1);
+            s_q[es][9][j][i]  = J*p_tauInv*(r_q[2]*r_q[3]*icubq1);
+
+            if(!p_SEMI_ANALYTIC){
+              s_q[es][4][j][i] -= J*p_tauInv*r_q[4];
+              s_q[es][5][j][i] -= J*p_tauInv*r_q[5];
+              s_q[es][6][j][i] -= J*p_tauInv*r_q[6];
+              s_q[es][7][j][i] -= J*p_tauInv*r_q[7];
+              s_q[es][8][j][i] -= J*p_tauInv*r_q[8];
+              s_q[es][9][j][i] -= J*p_tauInv*r_q[9];
+            }
+          }
+        }
       }
     }
-    
-    @barrier("local");
-    
+
+
     //project in j
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_cubNq;++j;@inner(1)){ 
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
-	  
+
           for(int fld=0; fld<p_Nfields; fld++)
-            r_q[fld] = 0.f; 
-	  
+            r_q[fld] = 0.f;
+
           if (j<p_Nq) {
 #pragma unroll p_cubNq
             for (int n=0;n<p_cubNq;n++) {
@@ -269,36 +264,34 @@
         }
       }
     }
-    
-    @barrier("local"); 
+
 
     //write register back to @shared
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_cubNq;++j;@inner(1)){ 
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
           if (j<p_Nq) {
             for(int fld=p_qNs; fld<p_Nfields; fld++){
-              s_q[es][fld][j][i] = r_q[fld];              
+              s_q[es][fld][j][i] = r_q[fld];
             }
           }
         }
       }
     }
-    
-    @barrier("local");
-    
+
+
     // project in i and update
-    for(int es=0;es<p_NblockV;++es;@inner(2)){ 
-      for(int j=0;j<p_cubNq;++j;@inner(1)){ 
-        for(int i=0;i<p_cubNq;++i;@inner(0)){  
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_cubNq;++j;@inner(1)){
+        for(int i=0;i<p_cubNq;++i;@inner(0)){
           const dlong et = eo+es; // element in block
           if((et<Nelements) && (i<p_Nq) && (j<p_Nq)){
             const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
             const dfloat invJW = vgeo[gid + p_IJWID*p_Np];
-	    
+
             for(int fld=0; fld<p_Nfields; fld++)
-              r_q[fld] = 0.f; 
-	    
+              r_q[fld] = 0.f;
+
 #pragma unroll p_cubNq
             for(int n=0;n<p_cubNq;++n){
               const dfloat Pni = s_cubProjectT[n][i];
@@ -306,13 +299,13 @@
                 r_q[fld] += Pni*s_q[es][fld][j][n];
               }
             }
-	    
+
             dlong rhsId = e*p_Np*p_Nfields + j*p_Nq + i;
-	    // 
-	    if(p_MRSAAB)
-	      rhsId     += shift*offset;
-	    
-	    for(int fld=p_qNs; fld<p_Nfields; fld++){
+            //
+            if(p_MRSAAB)
+              rhsId     += shift*offset;
+
+            for(int fld=p_qNs; fld<p_Nfields; fld++){
               rhsq[rhsId + fld*p_Np]  += invJW*r_q[fld];
             }
           }
diff --git a/solvers/bns/okl/bnsRelaxationTet3D.okl b/solvers/bns/okl/bnsRelaxationTet3D.okl
index 676263fa3..2764e80c2 100644
--- a/solvers/bns/okl/bnsRelaxationTet3D.okl
+++ b/solvers/bns/okl/bnsRelaxationTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -68,8 +68,6 @@ SOFTWARE.
       }
     }
 
-    // make sure all node data is loaded into @shared
-    @barrier("local");
 
     // interpolate q to cubature
     for(int es=0;es<p_NblockCub;++es;@inner(1)){
@@ -105,7 +103,6 @@ SOFTWARE.
     }
 
     //make sure all cubature node data is loaded into @shared
-    @barrier("local");
 
     // partial projection to nodes from cubature-sub-group
     for(int es=0;es<p_NblockCub;++es;@inner(1)){
@@ -201,7 +198,6 @@ SOFTWARE.
     }
 
     // make sure all node data is loaded into @shared
-    @barrier("local");
 
     // interpolate q to cubature
     for(int n=0;n<p_cubNp;++n;@inner(0)){
@@ -269,7 +265,6 @@ SOFTWARE.
     }
 
     // make sure all cubature node data is loaded into @shared
-    @barrier("local");
 
     // partial projection to nodes from cubature-sub-group
     for(int n=0;n<p_cubNp;++n;@inner(0)){
diff --git a/solvers/bns/okl/bnsRelaxationTri2D.okl b/solvers/bns/okl/bnsRelaxationTri2D.okl
index d73558c8e..126dff9ef 100644
--- a/solvers/bns/okl/bnsRelaxationTri2D.okl
+++ b/solvers/bns/okl/bnsRelaxationTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -68,7 +68,6 @@ SOFTWARE.
     }
 
     // make sure all node data is loaded into @shared
-    @barrier("local");
 
     // interpolate q to cubature
     for(int es=0;es<p_NblockCub;++es;@inner(1)){
@@ -100,7 +99,6 @@ SOFTWARE.
     }
 
     //make sure all cubature node data is loaded into @shared
-    @barrier("local");
 
     // partial projection to nodes from cubature-sub-group
     for(int es=0;es<p_NblockCub;++es;@inner(1)){
@@ -194,7 +192,6 @@ SOFTWARE.
     }
 
     // make sure all node data is loaded into @shared
-    @barrier("local");
 
     // interpolate q to cubature
     for(int n=0;n<p_cubNp;++n;@inner(0)){
@@ -250,7 +247,6 @@ SOFTWARE.
     }
 
     // make sure all cubature node data is loaded into @shared
-    @barrier("local");
 
     // partial projection to nodes from cubature-sub-group
     for(int n=0;n<p_cubNp;++n;@inner(0)){
diff --git a/solvers/bns/okl/bnsSurfaceHex3D.okl b/solvers/bns/okl/bnsSurfaceHex3D.okl
index 141ee1c31..35614e759 100644
--- a/solvers/bns/okl/bnsSurfaceHex3D.okl
+++ b/solvers/bns/okl/bnsSurfaceHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -608,7 +608,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -625,7 +624,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -641,7 +639,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -657,7 +654,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -717,7 +713,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -734,7 +729,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -750,7 +744,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -766,7 +759,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -830,7 +822,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -847,7 +838,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -863,7 +853,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -879,7 +868,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -939,7 +927,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -956,7 +943,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -972,7 +958,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -988,7 +973,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/bns/okl/bnsSurfaceQuad2D.okl b/solvers/bns/okl/bnsSurfaceQuad2D.okl
index e31d65299..36ecab595 100644
--- a/solvers/bns/okl/bnsSurfaceQuad2D.okl
+++ b/solvers/bns/okl/bnsSurfaceQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -474,7 +474,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 2
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -494,7 +493,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -514,7 +512,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -574,7 +571,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 2
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -594,7 +590,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -614,7 +609,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -677,7 +671,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 2
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -697,7 +690,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -717,7 +709,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     //
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -778,7 +769,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 2
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -798,7 +788,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -818,7 +807,6 @@ void surfaceMRTerms_split(const int e,
       }
     }
 
-    @barrier("local");
 
     //
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/bns/okl/bnsSurfaceQuad3D.okl b/solvers/bns/okl/bnsSurfaceQuad3D.okl
index f1866d09a..c28c89ad4 100644
--- a/solvers/bns/okl/bnsSurfaceQuad3D.okl
+++ b/solvers/bns/okl/bnsSurfaceQuad3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -75,66 +75,62 @@ void flux(const dfloat nx,
     for(dlong face=0;face<p_Nfaces;++face){      
       
       for(dlong es=0;es<p_NblockS;++es;@inner(1)){
-  for(dlong i=0;i<p_Nq;++i;@inner(0)){ 
-    
-    const dlong et = eo + es;
-    
-    if(et<Nelements){
+        for(dlong i=0;i<p_Nq;++i;@inner(0)){
 
-      const dlong   eM = elementIds[et];
-      
-      // load surface geofactors for this face
-      const dlong sid = p_Nsgeo*(eM*p_Nq*p_Nfaces + face*p_Nq + i); 
-      const dfloat nx = sgeo[sid+p_NXID];
-      const dfloat ny = sgeo[sid+p_NYID];
-      const dfloat nz = sgeo[sid+p_NZID];
-      const dfloat sJ = sgeo[sid+p_SJID];
-      const dfloat invJ = sgeo[sid+p_IJID];
-
-      // indices of negative and positive traces of face node
-      const dlong id  = eM*p_Nq*p_Nfaces + face*p_Nq + i;
-      const int idM = vmapM[id];
-      const int idP = vmapP[id];
-      const int eP = idP/p_Np;
-      
-      const dlong vidM = eM*p_Np*p_Nfields + (idM%p_Np);
-      const dlong vidP = eP*p_Np*p_Nfields + (idP%p_Np);
-      
-      dfloat dq[p_Nfields], fluxq[p_Nfields];
-      
-#pragma unroll p_Nfields
-      for(int fld=0;fld<p_Nfields;++fld){
-        const dfloat qM = q[vidM+fld*p_Np];
-        dfloat qP = q[vidP+fld*p_Np]; // assumes chunks of p_Np*p_Nfields per element
-        dq[fld] = qP-qM;
-      }
-      
-      
-      // D = nx*A + ny*B + nz*C
-        // compute 0.5*[q] + 0.5*D*[q]/sqrt(RT)
-      const dfloat alpha = 0.5, beta = 0.5*p_isqrtRT;
-      
-      flux(nx, ny, nz, alpha, beta, dq, fluxq);
-      
-      const dfloat one = 1.0, zero = 0.0;
-      
-      // compute D*(0.5*[q] + 0.5*D*[q])
-      flux(nx, ny, nz, zero, one, fluxq, dq);
-      
-      // evaluate "flux" terms: (sJ/J)*(D+sigma*D*D)*(q^* - q^-)
-      const dfloat sc = invJ*sJ*p_LIFT;
-      
-#pragma unroll p_Nfields
-      for(int fld=0;fld<p_Nfields;++fld){
-        rhsq[vidM+fld*p_Np] += sc*dq[fld]; // TW dq[fld];
-      }
-    }
-  }
+          const dlong et = eo + es;
+
+          if(et<Nelements){
+
+            const dlong   eM = elementIds[et];
+
+            // load surface geofactors for this face
+            const dlong sid = p_Nsgeo*(eM*p_Nq*p_Nfaces + face*p_Nq + i);
+            const dfloat nx = sgeo[sid+p_NXID];
+            const dfloat ny = sgeo[sid+p_NYID];
+            const dfloat nz = sgeo[sid+p_NZID];
+            const dfloat sJ = sgeo[sid+p_SJID];
+            const dfloat invJ = sgeo[sid+p_IJID];
+
+            // indices of negative and positive traces of face node
+            const dlong id  = eM*p_Nq*p_Nfaces + face*p_Nq + i;
+            const int idM = vmapM[id];
+            const int idP = vmapP[id];
+            const int eP = idP/p_Np;
+
+            const dlong vidM = eM*p_Np*p_Nfields + (idM%p_Np);
+            const dlong vidP = eP*p_Np*p_Nfields + (idP%p_Np);
+
+            dfloat dq[p_Nfields], fluxq[p_Nfields];
+
+      #pragma unroll p_Nfields
+            for(int fld=0;fld<p_Nfields;++fld){
+              const dfloat qM = q[vidM+fld*p_Np];
+              dfloat qP = q[vidP+fld*p_Np]; // assumes chunks of p_Np*p_Nfields per element
+              dq[fld] = qP-qM;
+            }
+
+
+            // D = nx*A + ny*B + nz*C
+              // compute 0.5*[q] + 0.5*D*[q]/sqrt(RT)
+            const dfloat alpha = 0.5, beta = 0.5*p_isqrtRT;
+
+            flux(nx, ny, nz, alpha, beta, dq, fluxq);
+
+            const dfloat one = 1.0, zero = 0.0;
+
+            // compute D*(0.5*[q] + 0.5*D*[q])
+            flux(nx, ny, nz, zero, one, fluxq, dq);
+
+            // evaluate "flux" terms: (sJ/J)*(D+sigma*D*D)*(q^* - q^-)
+            const dfloat sc = invJ*sJ*p_LIFT;
+
+      #pragma unroll p_Nfields
+            for(int fld=0;fld<p_Nfields;++fld){
+              rhsq[vidM+fld*p_Np] += sc*dq[fld]; // TW dq[fld];
+            }
+          }
+        }
       }
-      
-      // wait for all global memory writes of the previous inner loop to complete
-      @barrier("global");
-      
     }
   }
 }
@@ -174,61 +170,58 @@ void flux(const dfloat nx,
     
           if(et<Nelements){
 
-          const dlong   eM = elementIds[et];
-      
-          // load surface geofactors for this face
-          const dlong sid = p_Nsgeo*(eM*p_Nq*p_Nfaces + face*p_Nq + i); 
-          const dfloat nx = sgeo[sid+p_NXID];
-          const dfloat ny = sgeo[sid+p_NYID];
-          const dfloat nz = sgeo[sid+p_NZID];
-          const dfloat sJ = sgeo[sid+p_SJID];
-          const dfloat invJ = sgeo[sid+p_IJID];
-
-          // indices of negative and positive traces of face node
-          const dlong idM  = eM*p_Nq*p_Nfaces + face*p_Nq + i;
-          const dlong idP  = mapP[idM];
-          const dlong eP   = idP/(p_Nq*p_Nfaces);
-
-                
-          const dlong qidM = eM*p_Nfp*p_Nfaces*p_Nfields + idM%(p_Nq*p_Nfaces);                
-          const dlong qidP = eP*p_Nfp*p_Nfaces*p_Nfields + idP%(p_Nq*p_Nfaces);      
-
-          dfloat dq[p_Nfields], fluxq[p_Nfields];
-      
-          #pragma unroll p_Nfields
-          for(int fld=0;fld<p_Nfields;++fld){
-            const dfloat qM = fQM[qidM+fld*p_Nq*p_Nfaces];
-                  dfloat qP = fQM[qidP+fld*p_Nq*p_Nfaces]; // assumes chunks of p_Np*p_Nfields per element
-                  dq[fld]   = qP-qM;
-          }
-  
-      
-        // D = nx*A + ny*B + nz*C
-        // compute 0.5*[q] + 0.5*D*[q]/sqrt(RT)
-        const dfloat alpha = 0.5, beta = 0.5*p_isqrtRT;
+            const dlong   eM = elementIds[et];
+
+            // load surface geofactors for this face
+            const dlong sid = p_Nsgeo*(eM*p_Nq*p_Nfaces + face*p_Nq + i);
+            const dfloat nx = sgeo[sid+p_NXID];
+            const dfloat ny = sgeo[sid+p_NYID];
+            const dfloat nz = sgeo[sid+p_NZID];
+            const dfloat sJ = sgeo[sid+p_SJID];
+            const dfloat invJ = sgeo[sid+p_IJID];
+
+            // indices of negative and positive traces of face node
+            const dlong idM  = eM*p_Nq*p_Nfaces + face*p_Nq + i;
+            const dlong idP  = mapP[idM];
+            const dlong eP   = idP/(p_Nq*p_Nfaces);
+
+
+            const dlong qidM = eM*p_Nfp*p_Nfaces*p_Nfields + idM%(p_Nq*p_Nfaces);
+            const dlong qidP = eP*p_Nfp*p_Nfaces*p_Nfields + idP%(p_Nq*p_Nfaces);
+
+            dfloat dq[p_Nfields], fluxq[p_Nfields];
+
+            #pragma unroll p_Nfields
+            for(int fld=0;fld<p_Nfields;++fld){
+              const dfloat qM = fQM[qidM+fld*p_Nq*p_Nfaces];
+                    dfloat qP = fQM[qidP+fld*p_Nq*p_Nfaces]; // assumes chunks of p_Np*p_Nfields per element
+                    dq[fld]   = qP-qM;
+            }
 
-        flux(nx, ny, nz, alpha, beta, dq, fluxq);
 
-        const dfloat one = 1.0, zero = 0.0;
+          // D = nx*A + ny*B + nz*C
+          // compute 0.5*[q] + 0.5*D*[q]/sqrt(RT)
+          const dfloat alpha = 0.5, beta = 0.5*p_isqrtRT;
 
-        // compute D*(0.5*[q] + 0.5*D*[q])
-        flux(nx, ny, nz, zero, one, fluxq, dq);
+          flux(nx, ny, nz, alpha, beta, dq, fluxq);
 
-        // evaluate "flux" terms: (sJ/J)*(D+sigma*D*D)*(q^* - q^-)
-        const dfloat sc = invJ*sJ*p_LIFT;
-        
-        const dlong vidM = eM*p_Np*p_Nfields + (vmapM[idM]%p_Np) + shift*offset; 
+          const dfloat one = 1.0, zero = 0.0;
 
-        #pragma unroll p_Nfields
-        for(int fld=0;fld<p_Nfields;++fld){
-          rhsq[vidM+fld*p_Np] += sc*dq[fld]; // TW dq[fld];
+          // compute D*(0.5*[q] + 0.5*D*[q])
+          flux(nx, ny, nz, zero, one, fluxq, dq);
+
+          // evaluate "flux" terms: (sJ/J)*(D+sigma*D*D)*(q^* - q^-)
+          const dfloat sc = invJ*sJ*p_LIFT;
+
+          const dlong vidM = eM*p_Np*p_Nfields + (vmapM[idM]%p_Np) + shift*offset;
+
+          #pragma unroll p_Nfields
+          for(int fld=0;fld<p_Nfields;++fld){
+            rhsq[vidM+fld*p_Np] += sc*dq[fld]; // TW dq[fld];
+            }
           }
         }
       }
-    }
-      // wait for all global memory writes of the previous inner loop to complete
-      @barrier("global");
-      
     }
   }
 }
diff --git a/solvers/bns/okl/bnsSurfaceTet3D.okl b/solvers/bns/okl/bnsSurfaceTet3D.okl
index 02beec99a..126a9bac7 100644
--- a/solvers/bns/okl/bnsSurfaceTet3D.okl
+++ b/solvers/bns/okl/bnsSurfaceTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -245,7 +245,6 @@ void upwind_split(const dfloat nx,  const dfloat ny,  const dfloat nz, const dfl
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -413,7 +412,6 @@ void upwind_split(const dfloat nx,  const dfloat ny,  const dfloat nz, const dfl
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -591,7 +589,6 @@ void upwind_split(const dfloat nx,  const dfloat ny,  const dfloat nz, const dfl
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -757,7 +754,6 @@ void upwind_split(const dfloat nx,  const dfloat ny,  const dfloat nz, const dfl
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
diff --git a/solvers/bns/okl/bnsSurfaceTri2D.okl b/solvers/bns/okl/bnsSurfaceTri2D.okl
index 304ef5f4d..9b3700742 100644
--- a/solvers/bns/okl/bnsSurfaceTri2D.okl
+++ b/solvers/bns/okl/bnsSurfaceTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -164,7 +164,6 @@ void upwind(const dfloat nx, const dfloat ny, const dfloat c,
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -320,7 +319,6 @@ void upwind(const dfloat nx, const dfloat ny, const dfloat c,
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -514,7 +512,6 @@ void upwind_split(const dfloat nx, const dfloat ny, const dfloat c,
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -699,7 +696,6 @@ void upwind_split(const dfloat nx, const dfloat ny, const dfloat c,
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/bns/okl/bnsVolumeHex3D.okl b/solvers/bns/okl/bnsVolumeHex3D.okl
index bbf6310e3..f0b23da2b 100644
--- a/solvers/bns/okl/bnsVolumeHex3D.okl
+++ b/solvers/bns/okl/bnsVolumeHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -75,7 +75,6 @@ SOFTWARE.
 
 
     // make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -225,7 +224,6 @@ SOFTWARE.
       }
     }
     // make sure all node data is loaded into @shared
-    @barrier("local");
     for(int k=0; k<p_Nq; ++k;@inner(2)){
       for(int j=0; j<p_Nq; ++j;@inner(1)){
         for(int i=0; i<p_Nq; ++i; @inner(0)){
@@ -407,7 +405,6 @@ SOFTWARE.
       }
     }
     // make sure all node data is loaded into @shared
-    @barrier("local");
     for(int k=0; k<p_Nq; ++k;@inner(2)){
       for(int j=0; j<p_Nq; ++j;@inner(1)){
         for(int i=0; i<p_Nq; ++i; @inner(0)){
diff --git a/solvers/bns/okl/bnsVolumeQuad2D.okl b/solvers/bns/okl/bnsVolumeQuad2D.okl
index 6b6e6a006..c63e4d48b 100644
--- a/solvers/bns/okl/bnsVolumeQuad2D.okl
+++ b/solvers/bns/okl/bnsVolumeQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -76,7 +76,6 @@ SOFTWARE.
 
 
     // make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){// for all elements in block
       for(int j=0; j<p_Nq; ++j;@inner(1)){
@@ -204,7 +203,6 @@ SOFTWARE.
       }
     }
     // make sure all node data is loaded into @shared
-    @barrier("local");
     for(int es=0;es<p_NblockV;++es;@inner(2)){// for all elements in block
       for(int j=0; j<p_Nq; ++j;@inner(1)){
         for(int i=0; i<p_Nq; ++i; @inner(0)){
@@ -349,7 +347,6 @@ SOFTWARE.
       }
     }
     // make sure all node data is loaded into @shared
-    @barrier("local");
     for(int es=0;es<p_NblockV;++es;@inner(2)){// for all elements in block
       for(int j=0; j<p_Nq; ++j;@inner(1)){
         for(int i=0; i<p_Nq; ++i; @inner(0)){
diff --git a/solvers/bns/okl/bnsVolumeQuad3D.okl b/solvers/bns/okl/bnsVolumeQuad3D.okl
index 398c0f63f..486849f92 100644
--- a/solvers/bns/okl/bnsVolumeQuad3D.okl
+++ b/solvers/bns/okl/bnsVolumeQuad3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -26,223 +26,222 @@
 #define TDIR 1
 // Only difference with boltzmannVolume2D is size of rhsq and shifting
 @kernel void bnsVolumeQuad3D(const dlong Nelements,
-			     @restrict const  dlong *  elementIds,
-			     const dlong offset,
-			     const int shift,
-			     const dfloat fx,
-			     const dfloat fy,
-			     const dfloat fz,
-			     @restrict const  dfloat *  vgeo,
-			     @restrict const  dfloat * x, 
-			     @restrict const  dfloat * y,
-			     @restrict const  dfloat * z, 
-			     @restrict const  dfloat *  Dmatrices,
-			     @restrict const  dfloat *  q,
-			     @restrict dfloat *  rhsq){
-  
+                             @restrict const  dlong *  elementIds,
+                             const dlong offset,
+                             const int shift,
+                             const dfloat fx,
+                             const dfloat fy,
+                             const dfloat fz,
+                             @restrict const  dfloat *  vgeo,
+                             @restrict const  dfloat * x,
+                             @restrict const  dfloat * y,
+                             @restrict const  dfloat * z,
+                             @restrict const  dfloat *  Dmatrices,
+                             @restrict const  dfloat *  q,
+                             @restrict dfloat *  rhsq){
+
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
-    
+
     @shared dfloat s_q[p_Nfields][p_NblockV][p_Nq][p_Nq];
     @shared dfloat s_D[p_Nq][p_Nq];
     @shared dfloat s_DW[p_Nq][p_Nq];
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
-
-	  const dlong et = eo+es; // element in block
-	  if(et<Nelements){
-	    const dlong e = elementIds[et];
-	    const int base = i + j*p_Nq + p_Nfields*p_Np*e;
-	    for(int fld=0;fld<p_Nfields;++fld){
-	      s_q[fld][es][j][i] = q[base+fld*p_Np];
-	    }
-	  }
-	  
-	  if(es==0){
-	    s_D[j][i] = Dmatrices[j*p_Nq+i];
-	    s_DW[j][i] = Dmatrices[p_Nq*p_Nq + j*p_Nq+i];
-	  }
-	}
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+
+          const dlong et = eo+es; // element in block
+          if(et<Nelements){
+            const dlong e = elementIds[et];
+            const int base = i + j*p_Nq + p_Nfields*p_Np*e;
+            for(int fld=0;fld<p_Nfields;++fld){
+              s_q[fld][es][j][i] = q[base+fld*p_Np];
+            }
+          }
+
+          if(es==0){
+            s_D[j][i] = Dmatrices[j*p_Nq+i];
+            s_DW[j][i] = Dmatrices[p_Nq*p_Nq + j*p_Nq+i];
+          }
+        }
       }
     }
-    
+
     // make sure all node data is loaded into shared
-    @barrier("local");
-    
+
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
-	  const dlong et = eo+es; // element in block
-
-	  if(et<Nelements){
-            const dlong e = elementIds[et];   
-	    const int n=j*p_Nq+i;
-
-	    // prefetch geometric factors (constant on triangle)
-	    const int gbase = e*p_Np*p_Nvgeo + n;
-	    const dfloat drdx = vgeo[gbase + p_Np*p_RXID];
-	    const dfloat drdy = vgeo[gbase + p_Np*p_RYID];
-	    const dfloat drdz = vgeo[gbase + p_Np*p_RZID];
-	    const dfloat dsdx = vgeo[gbase + p_Np*p_SXID];
-	    const dfloat dsdy = vgeo[gbase + p_Np*p_SYID];
-	    const dfloat dsdz = vgeo[gbase + p_Np*p_SZID];
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+          const dlong et = eo+es; // element in block
+
+          if(et<Nelements){
+            const dlong e = elementIds[et];
+            const int n=j*p_Nq+i;
+
+            // prefetch geometric factors (constant on triangle)
+            const int gbase = e*p_Np*p_Nvgeo + n;
+            const dfloat drdx = vgeo[gbase + p_Np*p_RXID];
+            const dfloat drdy = vgeo[gbase + p_Np*p_RYID];
+            const dfloat drdz = vgeo[gbase + p_Np*p_RZID];
+            const dfloat dsdx = vgeo[gbase + p_Np*p_SXID];
+            const dfloat dsdy = vgeo[gbase + p_Np*p_SYID];
+            const dfloat dsdz = vgeo[gbase + p_Np*p_SZID];
 #if TDIR
-	    const dfloat dtdx = vgeo[gbase + p_Np*p_TXID];
-	    const dfloat dtdy = vgeo[gbase + p_Np*p_TYID];
-	    const dfloat dtdz = vgeo[gbase + p_Np*p_TZID];
+            const dfloat dtdx = vgeo[gbase + p_Np*p_TXID];
+            const dfloat dtdy = vgeo[gbase + p_Np*p_TYID];
+            const dfloat dtdz = vgeo[gbase + p_Np*p_TZID];
 #endif
 
-	    // compute 'r' and 's' derivatives of (q_m) at node n
-	    dfloat dqdr[p_Nfields], dqds[p_Nfields];
-	    
+            // compute 'r' and 's' derivatives of (q_m) at node n
+            dfloat dqdr[p_Nfields], dqds[p_Nfields];
+
 #pragma unroll p_Nfields
-	    for(int fld=0;fld<p_Nfields;++fld){
-	      dqdr[fld] = 0;
-	      dqds[fld] = 0;
-	    }
-	    
+            for(int fld=0;fld<p_Nfields;++fld){
+              dqdr[fld] = 0;
+              dqds[fld] = 0;
+            }
+
 #pragma unroll p_Nq
-	    for(int m=0;m<p_Nq;++m){
-	      const dfloat Dim = s_D[i][m]; // could L1
-	      const dfloat Djm = s_D[j][m];
-	      
-	      // differentiate q components with respect to 'r'
+            for(int m=0;m<p_Nq;++m){
+              const dfloat Dim = s_D[i][m]; // could L1
+              const dfloat Djm = s_D[j][m];
+
+              // differentiate q components with respect to 'r'
 #pragma unroll p_Nfields
-	      for(int fld=0;fld<p_Nfields;++fld)
-		dqdr[fld] += Dim*s_q[fld][es][j][m]; // 4 bytes per flop (2TF max)
+              for(int fld=0;fld<p_Nfields;++fld)
+                dqdr[fld] += Dim*s_q[fld][es][j][m]; // 4 bytes per flop (2TF max)
 
-	      // differentiate q components with respect to 's'
+              // differentiate q components with respect to 's'
 #pragma unroll p_Nfields
-	      for(int fld=0;fld<p_Nfields;++fld)
-		dqds[fld] += Djm*s_q[fld][es][m][i];
-	    }
-	    
-	    const dfloat q1  = s_q[0][es][j][i];
-	    const dfloat q2  = s_q[1][es][j][i];
-	    const dfloat q3  = s_q[2][es][j][i];
-	    const dfloat q4  = s_q[3][es][j][i];
-	    const dfloat q5  = s_q[4][es][j][i];
-	    const dfloat q6  = s_q[5][es][j][i];
-	    const dfloat q7  = s_q[6][es][j][i];
-	    const dfloat q8  = s_q[7][es][j][i];
-	    const dfloat q9  = s_q[8][es][j][i];
-	    const dfloat q10 = s_q[9][es][j][i];
-
-	    const dfloat invq1 = 1.f/q1;
+              for(int fld=0;fld<p_Nfields;++fld)
+                dqds[fld] += Djm*s_q[fld][es][m][i];
+            }
+
+            const dfloat q1  = s_q[0][es][j][i];
+            const dfloat q2  = s_q[1][es][j][i];
+            const dfloat q3  = s_q[2][es][j][i];
+            const dfloat q4  = s_q[3][es][j][i];
+            const dfloat q5  = s_q[4][es][j][i];
+            const dfloat q6  = s_q[5][es][j][i];
+            const dfloat q7  = s_q[6][es][j][i];
+            const dfloat q8  = s_q[7][es][j][i];
+            const dfloat q9  = s_q[8][es][j][i];
+            const dfloat q10 = s_q[9][es][j][i];
+
+            const dfloat invq1 = 1.f/q1;
 #if TDIR
-	    const dfloat dq1dx = drdx*dqdr[0] + dsdx*dqds[0] + dtdx*q1;
-	    const dfloat dq1dy = drdy*dqdr[0] + dsdy*dqds[0] + dtdy*q1;
-	    const dfloat dq1dz = drdz*dqdr[0] + dsdz*dqds[0] + dtdz*q1;
+            const dfloat dq1dx = drdx*dqdr[0] + dsdx*dqds[0] + dtdx*q1;
+            const dfloat dq1dy = drdy*dqdr[0] + dsdy*dqds[0] + dtdy*q1;
+            const dfloat dq1dz = drdz*dqdr[0] + dsdz*dqds[0] + dtdz*q1;
 
-	    const dfloat dq2dx = drdx*dqdr[1] + dsdx*dqds[1] + dtdx*q2;
-	    const dfloat dq2dy = drdy*dqdr[1] + dsdy*dqds[1] + dtdy*q2;
-	    const dfloat dq2dz = drdz*dqdr[1] + dsdz*dqds[1] + dtdz*q2;
+            const dfloat dq2dx = drdx*dqdr[1] + dsdx*dqds[1] + dtdx*q2;
+            const dfloat dq2dy = drdy*dqdr[1] + dsdy*dqds[1] + dtdy*q2;
+            const dfloat dq2dz = drdz*dqdr[1] + dsdz*dqds[1] + dtdz*q2;
 
-	    const dfloat dq3dx = drdx*dqdr[2] + dsdx*dqds[2] + dtdx*q3;
-	    const dfloat dq3dy = drdy*dqdr[2] + dsdy*dqds[2] + dtdy*q3;
-	    const dfloat dq3dz = drdz*dqdr[2] + dsdz*dqds[2] + dtdz*q3;
+            const dfloat dq3dx = drdx*dqdr[2] + dsdx*dqds[2] + dtdx*q3;
+            const dfloat dq3dy = drdy*dqdr[2] + dsdy*dqds[2] + dtdy*q3;
+            const dfloat dq3dz = drdz*dqdr[2] + dsdz*dqds[2] + dtdz*q3;
 
-	    const dfloat dq4dx = drdx*dqdr[3] + dsdx*dqds[3] + dtdx*q4;
-	    const dfloat dq4dy = drdy*dqdr[3] + dsdy*dqds[3] + dtdy*q4;
-	    const dfloat dq4dz = drdz*dqdr[3] + dsdz*dqds[3] + dtdz*q4;
+            const dfloat dq4dx = drdx*dqdr[3] + dsdx*dqds[3] + dtdx*q4;
+            const dfloat dq4dy = drdy*dqdr[3] + dsdy*dqds[3] + dtdy*q4;
+            const dfloat dq4dz = drdz*dqdr[3] + dsdz*dqds[3] + dtdz*q4;
 
-	    const dfloat dq5dx = drdx*dqdr[4] + dsdx*dqds[4] + dtdx*q5;
-	    const dfloat dq6dy = drdy*dqdr[5] + dsdy*dqds[5] + dtdy*q6;
-	    const dfloat dq7dz = drdz*dqdr[6] + dsdz*dqds[6] + dtdz*q7;
+            const dfloat dq5dx = drdx*dqdr[4] + dsdx*dqds[4] + dtdx*q5;
+            const dfloat dq6dy = drdy*dqdr[5] + dsdy*dqds[5] + dtdy*q6;
+            const dfloat dq7dz = drdz*dqdr[6] + dsdz*dqds[6] + dtdz*q7;
 
-	    const dfloat dq8dx = drdx*dqdr[7] + dsdx*dqds[7] + dtdx*q8;
-	    const dfloat dq8dy = drdy*dqdr[7] + dsdy*dqds[7] + dtdy*q8;
+            const dfloat dq8dx = drdx*dqdr[7] + dsdx*dqds[7] + dtdx*q8;
+            const dfloat dq8dy = drdy*dqdr[7] + dsdy*dqds[7] + dtdy*q8;
 
-	    const dfloat dq9dx = drdx*dqdr[8] + dsdx*dqds[8] + dtdx*q9;
-	    const dfloat dq9dz = drdz*dqdr[8] + dsdz*dqds[8] + dtdz*q9;
+            const dfloat dq9dx = drdx*dqdr[8] + dsdx*dqds[8] + dtdx*q9;
+            const dfloat dq9dz = drdz*dqdr[8] + dsdz*dqds[8] + dtdz*q9;
 
-	    const dfloat dq10dy = drdy*dqdr[9] + dsdy*dqds[9] + dtdy*q10;
-	    const dfloat dq10dz = drdz*dqdr[9] + dsdz*dqds[9] + dtdz*q10;
+            const dfloat dq10dy = drdy*dqdr[9] + dsdy*dqds[9] + dtdy*q10;
+            const dfloat dq10dz = drdz*dqdr[9] + dsdz*dqds[9] + dtdz*q10;
 #else
-			const dfloat dq1dx = drdx*dqdr[0] + dsdx*dqds[0];
-			const dfloat dq1dy = drdy*dqdr[0] + dsdy*dqds[0];
-			const dfloat dq1dz = drdz*dqdr[0] + dsdz*dqds[0];
+            const dfloat dq1dx = drdx*dqdr[0] + dsdx*dqds[0];
+            const dfloat dq1dy = drdy*dqdr[0] + dsdy*dqds[0];
+            const dfloat dq1dz = drdz*dqdr[0] + dsdz*dqds[0];
 
-			const dfloat dq2dx = drdx*dqdr[1] + dsdx*dqds[1];
-			const dfloat dq2dy = drdy*dqdr[1] + dsdy*dqds[1];
-			const dfloat dq2dz = drdz*dqdr[1] + dsdz*dqds[1];
+            const dfloat dq2dx = drdx*dqdr[1] + dsdx*dqds[1];
+            const dfloat dq2dy = drdy*dqdr[1] + dsdy*dqds[1];
+            const dfloat dq2dz = drdz*dqdr[1] + dsdz*dqds[1];
 
-			const dfloat dq3dx = drdx*dqdr[2] + dsdx*dqds[2];
-			const dfloat dq3dy = drdy*dqdr[2] + dsdy*dqds[2];
-			const dfloat dq3dz = drdz*dqdr[2] + dsdz*dqds[2];
+            const dfloat dq3dx = drdx*dqdr[2] + dsdx*dqds[2];
+            const dfloat dq3dy = drdy*dqdr[2] + dsdy*dqds[2];
+            const dfloat dq3dz = drdz*dqdr[2] + dsdz*dqds[2];
 
-			const dfloat dq4dx = drdx*dqdr[3] + dsdx*dqds[3];
-			const dfloat dq4dy = drdy*dqdr[3] + dsdy*dqds[3];
-			const dfloat dq4dz = drdz*dqdr[3] + dsdz*dqds[3];
+            const dfloat dq4dx = drdx*dqdr[3] + dsdx*dqds[3];
+            const dfloat dq4dy = drdy*dqdr[3] + dsdy*dqds[3];
+            const dfloat dq4dz = drdz*dqdr[3] + dsdz*dqds[3];
 
-			const dfloat dq5dx = drdx*dqdr[4] + dsdx*dqds[4];
-			const dfloat dq6dy = drdy*dqdr[5] + dsdy*dqds[5];
-			const dfloat dq7dz = drdz*dqdr[6] + dsdz*dqds[6];
+            const dfloat dq5dx = drdx*dqdr[4] + dsdx*dqds[4];
+            const dfloat dq6dy = drdy*dqdr[5] + dsdy*dqds[5];
+            const dfloat dq7dz = drdz*dqdr[6] + dsdz*dqds[6];
 
-			const dfloat dq8dx = drdx*dqdr[7] + dsdx*dqds[7];
-			const dfloat dq8dy = drdy*dqdr[7] + dsdy*dqds[7];
+            const dfloat dq8dx = drdx*dqdr[7] + dsdx*dqds[7];
+            const dfloat dq8dy = drdy*dqdr[7] + dsdy*dqds[7];
 
-			const dfloat dq9dx = drdx*dqdr[8] + dsdx*dqds[8];
-			const dfloat dq9dz = drdz*dqdr[8] + dsdz*dqds[8];
+            const dfloat dq9dx = drdx*dqdr[8] + dsdx*dqds[8];
+            const dfloat dq9dz = drdz*dqdr[8] + dsdz*dqds[8];
 
-			const dfloat dq10dy = drdy*dqdr[9] + dsdy*dqds[9];
-			const dfloat dq10dz = drdz*dqdr[9] + dsdz*dqds[9];
+            const dfloat dq10dy = drdy*dqdr[9] + dsdy*dqds[9];
+            const dfloat dq10dz = drdz*dqdr[9] + dsdz*dqds[9];
 #endif
 
-	    const dfloat rhsq1 = -p_sqrtRT*(dq2dx + dq3dy + dq4dz);
+            const dfloat rhsq1 = -p_sqrtRT*(dq2dx + dq3dy + dq4dz);
 
-	    dfloat rhsq2 = -p_sqrtRT*(dq1dx + p_sqrt2*dq5dx + dq8dy + dq9dz);
-	    dfloat rhsq3 = -p_sqrtRT*(dq1dy + p_sqrt2*dq6dy + dq8dx + dq10dz);
-	    dfloat rhsq4 = -p_sqrtRT*(dq1dz + p_sqrt2*dq7dz + dq9dx + dq10dy);
+            dfloat rhsq2 = -p_sqrtRT*(dq1dx + p_sqrt2*dq5dx + dq8dy + dq9dz);
+            dfloat rhsq3 = -p_sqrtRT*(dq1dy + p_sqrt2*dq6dy + dq8dx + dq10dz);
+            dfloat rhsq4 = -p_sqrtRT*(dq1dz + p_sqrt2*dq7dz + dq9dx + dq10dy);
 
-	    const dfloat rhsq5 = -p_sqrtRT*p_sqrt2*dq2dx;
-	    const dfloat rhsq6 = -p_sqrtRT*p_sqrt2*dq3dy;
-	    const dfloat rhsq7 = -p_sqrtRT*p_sqrt2*dq4dz;
+            const dfloat rhsq5 = -p_sqrtRT*p_sqrt2*dq2dx;
+            const dfloat rhsq6 = -p_sqrtRT*p_sqrt2*dq3dy;
+            const dfloat rhsq7 = -p_sqrtRT*p_sqrt2*dq4dz;
 
-	    const dfloat rhsq8  = -p_sqrtRT*(dq3dx+dq2dy);
-	    const dfloat rhsq9  = -p_sqrtRT*(dq4dx+dq2dz);
-	    const dfloat rhsq10 = -p_sqrtRT*(dq4dy+dq3dz);
+            const dfloat rhsq8  = -p_sqrtRT*(dq3dx+dq2dy);
+            const dfloat rhsq9  = -p_sqrtRT*(dq4dx+dq2dz);
+            const dfloat rhsq10 = -p_sqrtRT*(dq4dy+dq3dz);
 
-	    // constrain momentum changes to lie on sphere
-	    const dfloat xij = x[i+j*p_Nq+e*p_Np];
-	    const dfloat yij = y[i+j*p_Nq+e*p_Np];
-	    const dfloat zij = z[i+j*p_Nq+e*p_Np];
+            // constrain momentum changes to lie on sphere
+            const dfloat xij = x[i+j*p_Nq+e*p_Np];
+            const dfloat yij = y[i+j*p_Nq+e*p_Np];
+            const dfloat zij = z[i+j*p_Nq+e*p_Np];
 
 
-	    // add coriolis force to momentum equation
-	    rhsq2 -= p_fainv*zij*(yij*q4-zij*q3);
-	    rhsq3 -= p_fainv*zij*(zij*q2-xij*q4);
-	    rhsq4 -= p_fainv*zij*(xij*q3-yij*q2);
+            // add coriolis force to momentum equation
+            rhsq2 -= p_fainv*zij*(yij*q4-zij*q3);
+            rhsq3 -= p_fainv*zij*(zij*q2-xij*q4);
+            rhsq4 -= p_fainv*zij*(xij*q3-yij*q2);
 
 #if 0
-	    // remove radial component of momentum change
-	    const dfloat xdotrhsq = (rhsq2*xij + rhsq3*yij + rhsq4*zij)*p_invRadiusSq;
+            // remove radial component of momentum change
+            const dfloat xdotrhsq = (rhsq2*xij + rhsq3*yij + rhsq4*zij)*p_invRadiusSq;
 
-	    rhsq2 -= xij*xdotrhsq;
-	    rhsq3 -= yij*xdotrhsq;
-	    rhsq4 -= zij*xdotrhsq;
+            rhsq2 -= xij*xdotrhsq;
+            rhsq3 -= yij*xdotrhsq;
+            rhsq4 -= zij*xdotrhsq;
 #endif
 
-	    int base = e*p_Np*p_Nfields+n;
-
-	    if(p_MRSAAB){
-	      base += shift*offset;
-	    }
-
-	    rhsq[base+0*p_Np] = rhsq1;
-	    rhsq[base+1*p_Np] = rhsq2;
-	    rhsq[base+2*p_Np] = rhsq3;
-	    rhsq[base+3*p_Np] = rhsq4;
-	    rhsq[base+4*p_Np] = rhsq5;
-	    rhsq[base+5*p_Np] = rhsq6;
-	    rhsq[base+6*p_Np] = rhsq7;
-	    rhsq[base+7*p_Np] = rhsq8;
-	    rhsq[base+8*p_Np] = rhsq9;
-	    rhsq[base+9*p_Np] = rhsq10;
-	    
-	  }
-	}
+            int base = e*p_Np*p_Nfields+n;
+
+            if(p_MRSAAB){
+              base += shift*offset;
+            }
+
+            rhsq[base+0*p_Np] = rhsq1;
+            rhsq[base+1*p_Np] = rhsq2;
+            rhsq[base+2*p_Np] = rhsq3;
+            rhsq[base+3*p_Np] = rhsq4;
+            rhsq[base+4*p_Np] = rhsq5;
+            rhsq[base+5*p_Np] = rhsq6;
+            rhsq[base+6*p_Np] = rhsq7;
+            rhsq[base+7*p_Np] = rhsq8;
+            rhsq[base+8*p_Np] = rhsq9;
+            rhsq[base+9*p_Np] = rhsq10;
+
+          }
+        }
       }
     }
   }
diff --git a/solvers/bns/okl/bnsVolumeTet3D.okl b/solvers/bns/okl/bnsVolumeTet3D.okl
index c9047da5f..39643d94b 100644
--- a/solvers/bns/okl/bnsVolumeTet3D.okl
+++ b/solvers/bns/okl/bnsVolumeTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -67,7 +67,6 @@ SOFTWARE.
     }
 
     // make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
       // prefetch geometric factors (constant on triangle)
@@ -197,7 +196,6 @@ SOFTWARE.
     }
 
     // make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
       // prefetch geometric factors (constant on triangle)
@@ -364,7 +362,6 @@ SOFTWARE.
     }
 
     // make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
 
diff --git a/solvers/bns/okl/bnsVolumeTri2D.okl b/solvers/bns/okl/bnsVolumeTri2D.okl
index 0320e38c6..9abab0e6f 100644
--- a/solvers/bns/okl/bnsVolumeTri2D.okl
+++ b/solvers/bns/okl/bnsVolumeTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -69,7 +69,6 @@ SOFTWARE.
     }
 
     // make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
@@ -186,7 +185,6 @@ SOFTWARE.
     }
 
     // make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
@@ -322,7 +320,6 @@ SOFTWARE.
     }
 
     // make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
@@ -416,4 +413,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/bns/okl/bnsVorticityHex3D.okl b/solvers/bns/okl/bnsVorticityHex3D.okl
index fdc5f884e..1f0930505 100644
--- a/solvers/bns/okl/bnsVorticityHex3D.okl
+++ b/solvers/bns/okl/bnsVorticityHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -59,7 +59,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -118,4 +117,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/bns/okl/bnsVorticityQuad2D.okl b/solvers/bns/okl/bnsVorticityQuad2D.okl
index 24b088680..30fd97aa1 100644
--- a/solvers/bns/okl/bnsVorticityQuad2D.okl
+++ b/solvers/bns/okl/bnsVorticityQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -59,7 +59,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -96,4 +95,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/bns/okl/bnsVorticityQuad3D.okl b/solvers/bns/okl/bnsVorticityQuad3D.okl
index a404ac255..cb38d4341 100644
--- a/solvers/bns/okl/bnsVorticityQuad3D.okl
+++ b/solvers/bns/okl/bnsVorticityQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -62,7 +62,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
diff --git a/solvers/bns/okl/bnsVorticityTet3D.okl b/solvers/bns/okl/bnsVorticityTet3D.okl
index 15e53c060..6bd49b399 100644
--- a/solvers/bns/okl/bnsVorticityTet3D.okl
+++ b/solvers/bns/okl/bnsVorticityTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -54,7 +54,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
diff --git a/solvers/bns/okl/bnsVorticityTri2D.okl b/solvers/bns/okl/bnsVorticityTri2D.okl
index 55ea85b58..91ab9aa04 100644
--- a/solvers/bns/okl/bnsVorticityTri2D.okl
+++ b/solvers/bns/okl/bnsVorticityTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -51,7 +51,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
diff --git a/solvers/bns/src/bnsPlotFields.cpp b/solvers/bns/src/bnsPlotFields.cpp
index ba42b8c68..9ca239934 100644
--- a/solvers/bns/src/bnsPlotFields.cpp
+++ b/solvers/bns/src/bnsPlotFields.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,11 @@ SOFTWARE.
 #include "bns.hpp"
 
 // interpolate data to plot nodes and save to file (one per process)
-void bns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
+void bns_t::PlotFields(memory<dfloat>& Q, memory<dfloat>& V, std::string fileName){
 
   FILE *fp;
 
-  fp = fopen(fileName, "w");
+  fp = fopen(fileName.c_str(), "w");
 
   fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
   fprintf(fp, "  <UnstructuredGrid>\n");
@@ -44,39 +44,45 @@ void bns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
   fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
 
   //scratch space for interpolation
-  size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat);
-  dfloat* scratch = (dfloat *) malloc(2*NscratchBytes);
+  size_t Nscratch = std::max(mesh.Np, mesh.plotNp);
+  memory<dfloat> scratch(2*Nscratch);
 
-  dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ix(mesh.plotNp);
+  memory<dfloat> Iy(mesh.plotNp);
+  memory<dfloat> Iz(mesh.plotNp);
 
   // compute plot node coordinates on the fly
   for(dlong e=0;e<mesh.Nelements;++e){
     mesh.PlotInterp(mesh.x + e*mesh.Np, Ix, scratch);
     mesh.PlotInterp(mesh.y + e*mesh.Np, Iy, scratch);
-    mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
+    if(mesh.dim==3)
+      mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
 
-    for(int n=0;n<mesh.plotNp;++n){
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+    if (mesh.dim==2) {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],0.0);
+      }
+    } else {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+      }
     }
   }
   fprintf(fp, "        </DataArray>\n");
   fprintf(fp, "      </Points>\n");
 
-  free(Ix); free(Iy); free(Iz);
-
-  dfloat* u = (dfloat *) malloc(mesh.Np*sizeof(dfloat));
-  dfloat* v = (dfloat *) malloc(mesh.Np*sizeof(dfloat));
-  dfloat* w = (dfloat *) malloc(mesh.Np*sizeof(dfloat));
+  memory<dfloat> u(mesh.Np);
+  memory<dfloat> v(mesh.Np);
+  memory<dfloat> w(mesh.Np);
 
-  dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ip(mesh.plotNp);
+  memory<dfloat> Iu(mesh.plotNp);
+  memory<dfloat> Iv(mesh.plotNp);
+  memory<dfloat> Iw(mesh.plotNp);
 
-  if (Q!=NULL) {
+  if (Q.length()!=0) {
     // write out density
     fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
     fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Density\" Format=\"ascii\">\n");
@@ -130,7 +136,7 @@ void bns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
     fprintf(fp, "       </DataArray>\n");
   }
 
-  if (V!=NULL) {
+  if (V.length()!=0) {
     // write out vorticity
     if(mesh.dim==2){
       fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Vorticity\" Format=\"ascii\">\n");
@@ -160,9 +166,6 @@ void bns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
   }
   fprintf(fp, "     </PointData>\n");
 
-  free(u); free(v); free(w);
-  free(Ip); free(Iu); free(Iv); free(Iw);
-
   fprintf(fp, "    <Cells>\n");
   fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
 
@@ -203,6 +206,4 @@ void bns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
   fprintf(fp, "  </UnstructuredGrid>\n");
   fprintf(fp, "</VTKFile>\n");
   fclose(fp);
-
-  free(scratch);
 }
diff --git a/solvers/bns/src/bnsPmlSetup.cpp b/solvers/bns/src/bnsPmlSetup.cpp
index 38685c0f9..3d3f8197e 100644
--- a/solvers/bns/src/bnsPmlSetup.cpp
+++ b/solvers/bns/src/bnsPmlSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -108,7 +108,7 @@ void bns_t::PmlSetup(){
     int pmlNp = (pmlcubature) ? mesh.cubNp : mesh.Np;
     int pmlNq = (pmlcubature) ? mesh.cubNq : mesh.Nq;
 
-    dfloat *pmlr, *pmls, *pmlt;
+    memory<dfloat> pmlr, pmls, pmlt;
     if(pmlcubature){
       pmlr = mesh.cubr;
       pmls = mesh.cubs;
@@ -121,27 +121,27 @@ void bns_t::PmlSetup(){
 
     // printf("Setting PML Coefficient \n");
     //set up damping parameter
-    pmlSigma = (dfloat *) calloc(mesh.dim*mesh.NpmlElements*pmlNp,sizeof(dfloat));
+    pmlSigma.malloc(mesh.dim*mesh.NpmlElements*pmlNp, 0.0);
 
     for (dlong m=0;m<mesh.NpmlElements;m++){
       dlong e     = mesh.pmlElements[m];
       hlong type  = mesh.elementInfo[e];
 
       //element vertices
-      const dfloat *xe = mesh.EX + e*mesh.Nverts;
-      const dfloat *ye = mesh.EY + e*mesh.Nverts;
-      const dfloat *ze = mesh.EZ + e*mesh.Nverts;
+      memory<dfloat> xe = mesh.EX + e*mesh.Nverts;
+      memory<dfloat> ye = mesh.EY + e*mesh.Nverts;
+      memory<dfloat> ze = mesh.EZ + e*mesh.Nverts;
 
       for(int n=0;n<pmlNp;++n){ /* for each node */
         dfloat x  = 0, y  = 0, z  = 0;
         dfloat rn = 0, sn = 0, tn = 0;
-        if(mesh.elementType==TRIANGLES){
+        if(mesh.elementType==Mesh::TRIANGLES){
           rn = pmlr[n];
           sn = pmls[n];
 
           x = -0.5*(rn+sn)*xe[0] + 0.5*(1+rn)*xe[1] + 0.5*(1+sn)*xe[2];
           y = -0.5*(rn+sn)*ye[0] + 0.5*(1+rn)*ye[1] + 0.5*(1+sn)*ye[2];
-        } else if(mesh.elementType==QUADRILATERALS){
+        } else if(mesh.elementType==Mesh::QUADRILATERALS){
           const int i = n%pmlNq;
           const int j = n/pmlNq;
           rn = pmlr[i];
@@ -149,7 +149,7 @@ void bns_t::PmlSetup(){
 
           x =  0.25*( (1.0-rn)*(1-sn)*xe[0]+(1.0-rn)*(1+sn)*xe[1]+(1.0+rn)*(1+sn)*xe[2]+(1.0+rn)*(1-sn)*xe[3]);
           y =  0.25*( (1.0-rn)*(1-sn)*ye[0]+(1.0-rn)*(1+sn)*ye[1]+(1.0+rn)*(1+sn)*ye[2]+(1.0+rn)*(1-sn)*ye[3]);
-        } else if(mesh.elementType==TETRAHEDRA){
+        } else if(mesh.elementType==Mesh::TETRAHEDRA){
           rn = pmlr[n];
           sn = pmls[n];
           tn = pmlt[n];
@@ -157,7 +157,7 @@ void bns_t::PmlSetup(){
           x = -0.5*(rn+sn+tn+1)*xe[0] + 0.5*(1+rn)*xe[1] + 0.5*(1+sn)*xe[2] + 0.5*(tn+1)*xe[3];
           y = -0.5*(rn+sn+tn+1)*ye[0] + 0.5*(1+rn)*ye[1] + 0.5*(1+sn)*ye[2] + 0.5*(tn+1)*ye[3];
           z = -0.5*(rn+sn+tn+1)*ze[0] + 0.5*(1+rn)*ze[1] + 0.5*(1+sn)*ze[2] + 0.5*(tn+1)*ze[3];
-        } else if(mesh.elementType==HEXAHEDRA){
+        } else if(mesh.elementType==Mesh::HEXAHEDRA){
           const int i = n%pmlNq;
           const int j = (n/pmlNq)%pmlNq;
           const int k = (n/pmlNq)/pmlNq;
@@ -237,6 +237,6 @@ void bns_t::PmlSetup(){
 
     // printf("# of PML elements: %d and # of Non-PML elements: %d \n",mesh.NpmlElements, mesh.Nelements-mesh.NpmlElements);
     if (mesh.NpmlElements)
-      o_pmlSigma = platform.malloc(mesh.dim*mesh.NpmlElements*pmlNp*sizeof(dfloat),pmlSigma);
+      o_pmlSigma = platform.malloc<dfloat>(pmlSigma);
   }
 }
diff --git a/solvers/bns/src/bnsReport.cpp b/solvers/bns/src/bnsReport.cpp
index 6dae7648c..68e17099b 100644
--- a/solvers/bns/src/bnsReport.cpp
+++ b/solvers/bns/src/bnsReport.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -37,7 +37,7 @@ void bns_t::Report(dfloat time, int tstep){
   mesh.MassMatrixApply(o_q, o_Mq);
 
   dlong Nentries = mesh.Nelements*mesh.Np*Nfields;
-  dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+  dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
   if(mesh.rank==0)
     printf("%5.2f (%d), %5.2f (time, timestep, norm)\n", time, tstep, norm2);
@@ -49,12 +49,12 @@ void bns_t::Report(dfloat time, int tstep){
     o_Vort.copyTo(Vort);
 
     // output field files
-    string name;
+    std::string name;
     settings.getSetting("OUTPUT FILE NAME", name);
     char fname[BUFSIZ];
     sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++);
 
-    PlotFields(q, Vort, fname);
+    PlotFields(q, Vort, std::string(fname));
   }
 
   /*
diff --git a/solvers/bns/src/bnsRun.cpp b/solvers/bns/src/bnsRun.cpp
index 84ae062e6..342f520dd 100644
--- a/solvers/bns/src/bnsRun.cpp
+++ b/solvers/bns/src/bnsRun.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -51,7 +51,7 @@ void bns_t::Run(){
   dfloat dtAdv  = hmin/(vmax*(mesh.N+1.)*(mesh.N+1.));
   dfloat dtVisc = 1.0/tauInv;
 
-  dfloat dt = (semiAnalytic) ? cfl*dtAdv : cfl*mymin(dtAdv, dtVisc);
+  dfloat dt = (semiAnalytic) ? cfl*dtAdv : cfl*std::min(dtAdv, dtVisc);
   /*
     Artificial warping of time step size for multirate testing
     */
@@ -60,9 +60,9 @@ void bns_t::Run(){
       settings.compareSetting("TIME INTEGRATOR","MRSAAB3"))
     dt /= (1<<(mesh.mrNlevels-1));
 #endif
-  timeStepper->SetTimeStep(dt);
+  timeStepper.SetTimeStep(dt);
 
-  timeStepper->Run(o_q, startTime, finalTime);
+  timeStepper.Run(*this, o_q, startTime, finalTime);
 
   // output norm of final solution
   {
@@ -70,7 +70,7 @@ void bns_t::Run(){
     mesh.MassMatrixApply(o_q, o_Mq);
 
     dlong Nentries = mesh.Nelements*mesh.Np*Nfields;
-    dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+    dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
     if(mesh.rank==0)
       printf("Solution norm = %17.15lg\n", norm2);
diff --git a/solvers/bns/src/bnsSettings.cpp b/solvers/bns/src/bnsSettings.cpp
index 6fc2cd10f..cff7fe8e8 100644
--- a/solvers/bns/src/bnsSettings.cpp
+++ b/solvers/bns/src/bnsSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ SOFTWARE.
 #include "bns.hpp"
 
 //settings for bns solver
-bnsSettings_t::bnsSettings_t(MPI_Comm& _comm):
+bnsSettings_t::bnsSettings_t(comm_t& _comm):
   settings_t(_comm) {
 
   newSetting("DATA FILE",
@@ -95,10 +95,7 @@ bnsSettings_t::bnsSettings_t(MPI_Comm& _comm):
 
 void bnsSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) {
     std::cout << "BNS Settings:\n\n";
     reportSetting("DATA FILE");
     reportSetting("SPEED OF SOUND");
@@ -119,15 +116,15 @@ void bnsSettings_t::report() {
 
 void bnsSettings_t::parseFromFile(platformSettings_t& platformSettings,
                                   meshSettings_t& meshSettings,
-                                  const string filename) {
+                                  const std::string filename) {
   //read all settings from file
   settings_t s(comm);
   s.readSettingsFromFile(filename);
 
   for(auto it = s.settings.begin(); it != s.settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
-    const string val = set->getVal<string>();
+    setting_t& set = it->second;
+    const std::string name = set.getName();
+    const std::string val = set.getVal<std::string>();
     if (platformSettings.hasSetting(name))
       platformSettings.changeSetting(name, val);
     else if (meshSettings.hasSetting(name))
@@ -135,9 +132,7 @@ void bnsSettings_t::parseFromFile(platformSettings_t& platformSettings,
     else if (hasSetting(name)) //self
       changeSetting(name, val);
     else  {
-      stringstream ss;
-      ss << "Unknown setting: [" << name << "] requested";
-      LIBP_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested");
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/bns/src/bnsSetup.cpp b/solvers/bns/src/bnsSetup.cpp
index 8831c5e32..842c54104 100644
--- a/solvers/bns/src/bnsSetup.cpp
+++ b/solvers/bns/src/bnsSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,54 +26,60 @@ SOFTWARE.
 
 #include "bns.hpp"
 
-bns_t& bns_t::Setup(platform_t& platform, mesh_t& mesh,
-                    bnsSettings_t& settings){
+void bns_t::Setup(platform_t& _platform, mesh_t& _mesh,
+                  bnsSettings_t& _settings){
 
-  bns_t* bns = new bns_t(platform, mesh, settings);
+  platform = _platform;
+  mesh = _mesh;
+  comm = _mesh.comm;
+  settings = _settings;
 
   //get physical paramters
-  settings.getSetting("SPEED OF SOUND", bns->c);
-  settings.getSetting("VISCOSITY", bns->nu);
-  bns->RT     = bns->c*bns->c;
-  bns->tauInv = bns->RT/bns->nu;
+  settings.getSetting("SPEED OF SOUND", c);
+  settings.getSetting("VISCOSITY", nu);
+  RT     = c*c;
+  tauInv = RT/nu;
 
-  bns->Nfields    = (mesh.dim==3) ? 10:6;
-  bns->Npmlfields = mesh.dim*bns->Nfields;
+  Nfields    = (mesh.dim==3) ? 10:6;
+  Npmlfields = mesh.dim*Nfields;
+
+  //Trigger JIT kernel builds
+  ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add);
 
   //setup cubature
   mesh.CubatureSetup();
 
   //Setup PML
-  bns->PmlSetup();
+  PmlSetup();
 
   //setup timeStepper
-  dlong Nlocal = mesh.Nelements*mesh.Np*bns->Nfields;
-  dlong Nhalo  = mesh.totalHaloPairs*mesh.Np*bns->Nfields;
+  dlong Nlocal = mesh.Nelements*mesh.Np*Nfields;
+  dlong Nhalo  = mesh.totalHaloPairs*mesh.Np*Nfields;
 
-  bns->semiAnalytic = 0;
+  semiAnalytic = 0;
   if (settings.compareSetting("TIME INTEGRATOR","SARK4")
     ||settings.compareSetting("TIME INTEGRATOR","SARK5")
     ||settings.compareSetting("TIME INTEGRATOR","SAAB3")
     ||settings.compareSetting("TIME INTEGRATOR","MRSAAB3"))
-    bns->semiAnalytic = 1;
+    semiAnalytic = 1;
 
   //semi-analytic exponential coefficients
-  dfloat lambda[bns->Nfields];
+  memory<dfloat> lambda(Nfields);
   for (int i=0;i<mesh.dim+1;i++) lambda[i] = 0.0;
-  for (int i=mesh.dim+1;i<bns->Nfields;i++) lambda[i] = -bns->tauInv;
+  for (int i=mesh.dim+1;i<Nfields;i++) lambda[i] = -tauInv;
 
   //make array of time step estimates for each element
-  dfloat *EtoDT = (dfloat *) calloc(mesh.Nelements,sizeof(dfloat));
-  dfloat vmax = bns->MaxWaveSpeed();
+  memory<dfloat> EtoDT(mesh.Nelements);
+  dfloat vmax = MaxWaveSpeed();
   for(dlong e=0;e<mesh.Nelements;++e){
     dfloat h = mesh.ElementCharacteristicLength(e);
     dfloat dtAdv  = h/(vmax*(mesh.N+1.)*(mesh.N+1.));
-    dfloat dtVisc = 1.0/bns->tauInv;
+    dfloat dtVisc = 1.0/tauInv;
 
-    if (bns->semiAnalytic)
+    if (semiAnalytic)
       EtoDT[e] = dtAdv;
     else
-      EtoDT[e] = mymin(dtAdv, dtVisc);
+      EtoDT[e] = std::min(dtAdv, dtVisc);
 
     /*
     Artificial warping of time step size for multirate testing
@@ -94,7 +100,7 @@ bns_t& bns_t::Setup(platform_t& platform, mesh_t& mesh,
     if (mesh.dim==3)
       c = mymin(c,fabs(z));
 
-    c = mymax(0.5, c);
+    c = std::max(0.5, c);
     EtoDT[e] *= c;
 #endif
   }
@@ -104,183 +110,179 @@ bns_t& bns_t::Setup(platform_t& platform, mesh_t& mesh,
       settings.compareSetting("TIME INTEGRATOR","MRSAAB3")) {
     mesh.MultiRateSetup(EtoDT);
     mesh.MultiRatePmlSetup();
-    bns->multirateTraceHalo = mesh.MultiRateHaloTraceSetup(bns->Nfields);
+    multirateTraceHalo = mesh.MultiRateHaloTraceSetup(Nfields);
   }
 
   if (settings.compareSetting("TIME INTEGRATOR","MRAB3")){
-    bns->timeStepper = new TimeStepper::mrab3_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs,
-                                              mesh.Np, bns->Nfields, bns->Npmlfields, *bns, mesh);
+    timeStepper.Setup<TimeStepper::mrab3_pml>(mesh.Nelements, mesh.NpmlElements,
+                                              mesh.totalHaloPairs,
+                                              mesh.Np, Nfields, Npmlfields,
+                                              platform, mesh);
   } else if (settings.compareSetting("TIME INTEGRATOR","MRSAAB3")){
-    bns->timeStepper = new TimeStepper::mrsaab3_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs,
-                                              mesh.Np, bns->Nfields, bns->Npmlfields, lambda, *bns, mesh);
+    timeStepper.Setup<TimeStepper::mrsaab3_pml>(mesh.Nelements, mesh.NpmlElements,
+                                                mesh.totalHaloPairs,
+                                                mesh.Np, Nfields, Npmlfields,
+                                                lambda, platform, mesh);
   } else if (settings.compareSetting("TIME INTEGRATOR","SAAB3")) {
-    bns->timeStepper = new TimeStepper::saab3_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs,
-                                              mesh.Np, bns->Nfields, bns->Npmlfields, lambda, *bns);
+    timeStepper.Setup<TimeStepper::saab3_pml>(mesh.Nelements, mesh.NpmlElements,
+                                              mesh.totalHaloPairs,
+                                              mesh.Np, Nfields, Npmlfields,
+                                              lambda, platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","AB3")){
-    bns->timeStepper = new TimeStepper::ab3_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs,
-                                              mesh.Np, bns->Nfields, bns->Npmlfields, *bns);
+    timeStepper.Setup<TimeStepper::ab3_pml>(mesh.Nelements, mesh.NpmlElements,
+                                            mesh.totalHaloPairs,
+                                            mesh.Np, Nfields, Npmlfields,
+                                            platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","LSERK4")){
-    bns->timeStepper = new TimeStepper::lserk4_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs,
-                                              mesh.Np, bns->Nfields, bns->Npmlfields, *bns);
+    timeStepper.Setup<TimeStepper::lserk4_pml>(mesh.Nelements, mesh.NpmlElements,
+                                               mesh.totalHaloPairs,
+                                               mesh.Np, Nfields, Npmlfields,
+                                               platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","DOPRI5")){
-    bns->timeStepper = new TimeStepper::dopri5_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs,
-                                              mesh.Np, bns->Nfields, bns->Npmlfields, *bns, mesh.comm);
+    timeStepper.Setup<TimeStepper::dopri5_pml>(mesh.Nelements, mesh.NpmlElements,
+                                               mesh.totalHaloPairs,
+                                               mesh.Np, Nfields, Npmlfields,
+                                               platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","SARK4")) {
-    bns->timeStepper = new TimeStepper::sark4_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs,
-                                              mesh.Np, bns->Nfields, bns->Npmlfields, lambda, *bns, mesh.comm);
+    timeStepper.Setup<TimeStepper::sark4_pml>(mesh.Nelements, mesh.NpmlElements,
+                                              mesh.totalHaloPairs,
+                                              mesh.Np, Nfields, Npmlfields,
+                                              lambda, platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","SARK5")) {
-    bns->timeStepper = new TimeStepper::sark5_pml(mesh.Nelements, mesh.NpmlElements, mesh.totalHaloPairs,
-                                              mesh.Np, bns->Nfields, bns->Npmlfields, lambda, *bns, mesh.comm);
+    timeStepper.Setup<TimeStepper::sark5_pml>(mesh.Nelements, mesh.NpmlElements,
+                                              mesh.totalHaloPairs,
+                                              mesh.Np, Nfields, Npmlfields,
+                                              lambda, platform, comm);
   } else {
-    LIBP_ABORT(string("Requested TIME INTEGRATOR not found."));
+    LIBP_FORCE_ABORT("Requested TIME INTEGRATOR not found.");
   }
-  free(EtoDT);
 
   //setup linear algebra module
-  platform.linAlg.InitKernels({"innerProd"});
+  platform.linAlg().InitKernels({"innerProd"});
 
   /*setup trace halo exchange */
-  bns->traceHalo = mesh.HaloTraceSetup(bns->Nfields);
+  traceHalo = mesh.HaloTraceSetup(Nfields);
 
   // compute samples of q at interpolation nodes
-  bns->q = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat));
-  bns->o_q = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), bns->q);
+  q.malloc(Nlocal+Nhalo, 0.0);
+  o_q = platform.malloc<dfloat>(q);
 
-  bns->Vort = (dfloat*) calloc(mesh.dim*mesh.Nelements*mesh.Np, sizeof(dfloat));
-  bns->o_Vort = platform.malloc((mesh.dim*mesh.Nelements*mesh.Np)*sizeof(dfloat),
-                                              bns->Vort);
+  Vort.malloc(mesh.dim*mesh.Nelements*mesh.Np, 0.0);
+  o_Vort = platform.malloc<dfloat>(Vort);
 
   //storage for M*q during reporting
-  bns->o_Mq = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), bns->q);
-  mesh.MassMatrixKernelSetup(bns->Nfields); // mass matrix operator
+  o_Mq = platform.malloc<dfloat>(q);
+  mesh.MassMatrixKernelSetup(Nfields); // mass matrix operator
 
   // OCCA build stuff
-  occa::properties kernelInfo = mesh.props; //copy base occa properties
+  properties_t kernelInfo = mesh.props; //copy base occa properties
 
   //add boundary data to kernel info
-  string dataFileName;
+  std::string dataFileName;
   settings.getSetting("DATA FILE", dataFileName);
   kernelInfo["includes"] += dataFileName;
 
-  kernelInfo["defines/" "p_Nfields"]= bns->Nfields;
-  kernelInfo["defines/" "p_Npmlfields"]= bns->Npmlfields;
+  kernelInfo["defines/" "p_Nfields"]= Nfields;
+  kernelInfo["defines/" "p_Npmlfields"]= Npmlfields;
 
-  int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces));
+  int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces));
   kernelInfo["defines/" "p_maxNodes"]= maxNodes;
 
   int blockMax = 256;
   if (platform.device.mode()=="CUDA") blockMax = 512;
 
-  int NblockV = mymax(1, blockMax/mesh.Np);
+  int NblockV = std::max(1, blockMax/mesh.Np);
   kernelInfo["defines/" "p_NblockV"]= NblockV;
 
-  int NblockS = mymax(1, blockMax/maxNodes);
+  int NblockS = std::max(1, blockMax/maxNodes);
   kernelInfo["defines/" "p_NblockS"]= NblockS;
 
-  int NblockCub = mymax(1, blockMax/mesh.cubNp);
+  int NblockCub = std::max(1, blockMax/mesh.cubNp);
   kernelInfo["defines/" "p_NblockCub"]= NblockCub;
 
-  kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
-
   // set kernel name suffix
-  char *suffix;
-  if(mesh.elementType==TRIANGLES)
-    suffix = strdup("Tri2D");
-  if(mesh.elementType==QUADRILATERALS)
-    suffix = strdup("Quad2D");
-  if(mesh.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  if(mesh.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  std::string suffix;
+  if(mesh.elementType==Mesh::TRIANGLES)
+    suffix = "Tri2D";
+  if(mesh.elementType==Mesh::QUADRILATERALS)
+    suffix = "Quad2D";
+  if(mesh.elementType==Mesh::TETRAHEDRA)
+    suffix = "Tet3D";
+  if(mesh.elementType==Mesh::HEXAHEDRA)
+    suffix = "Hex3D";
+
+  std::string oklFilePrefix = DBNS "/okl/";
+  std::string oklFileSuffix = ".okl";
+
+  std::string fileName, kernelName;
 
   // kernels from volume file
-  sprintf(fileName, DBNS "/okl/bnsVolume%s.okl", suffix);
-  sprintf(kernelName, "bnsVolume%s", suffix);
-  bns->volumeKernel =  platform.buildKernel(fileName, kernelName,
+  fileName   = oklFilePrefix + "bnsVolume" + suffix + oklFileSuffix;
+  kernelName = "bnsVolume" + suffix;
+  volumeKernel =  platform.buildKernel(fileName, kernelName,
                                          kernelInfo);
 
-  if (bns->pmlcubature) {
-    sprintf(kernelName, "bnsPmlVolumeCub%s", suffix);
-    bns->pmlVolumeKernel =  platform.buildKernel(fileName, kernelName,
+  if (pmlcubature) {
+    kernelName = "bnsPmlVolumeCub" + suffix;
+    pmlVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                          kernelInfo);
   } else {
-    sprintf(kernelName, "bnsPmlVolume%s", suffix);
-    bns->pmlVolumeKernel =  platform.buildKernel(fileName, kernelName,
+    kernelName = "bnsPmlVolume" + suffix;
+    pmlVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                          kernelInfo);
   }
 
   // kernels from relaxation file
-  sprintf(fileName, DBNS "/okl/bnsRelaxation%s.okl", suffix);
-  sprintf(kernelName, "bnsRelaxation%s", suffix);
-  bns->relaxationKernel = platform.buildKernel(fileName, kernelName,
+  fileName   = oklFilePrefix + "bnsRelaxation" + suffix + oklFileSuffix;
+  kernelName = "bnsRelaxation" + suffix;
+  relaxationKernel = platform.buildKernel(fileName, kernelName,
                                          kernelInfo);
-  if (bns->pmlcubature) {
-    sprintf(kernelName, "bnsPmlRelaxationCub%s", suffix);
-    bns->pmlRelaxationKernel = platform.buildKernel(fileName, kernelName,
+  if (pmlcubature) {
+    kernelName = "bnsPmlRelaxationCub" + suffix;
+    pmlRelaxationKernel = platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   } else {
-    bns->pmlRelaxationKernel = platform.buildKernel(fileName, kernelName,
+    pmlRelaxationKernel = platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   }
 
 
   // kernels from surface file
-  sprintf(fileName, DBNS "/okl/bnsSurface%s.okl", suffix);
+  fileName   = oklFilePrefix + "bnsSurface" + suffix + oklFileSuffix;
   if (settings.compareSetting("TIME INTEGRATOR","MRAB3") ||
       settings.compareSetting("TIME INTEGRATOR","MRSAAB3")) {
-    sprintf(kernelName, "bnsMRSurface%s", suffix);
-    bns->surfaceKernel = platform.buildKernel(fileName, kernelName,
+    kernelName = "bnsMRSurface" + suffix;
+    surfaceKernel = platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
 
-    sprintf(kernelName, "bnsMRPmlSurface%s", suffix);
-    bns->pmlSurfaceKernel = platform.buildKernel(fileName, kernelName,
+    kernelName = "bnsMRPmlSurface" + suffix;
+    pmlSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   } else {
-    sprintf(kernelName, "bnsSurface%s", suffix);
-    bns->surfaceKernel = platform.buildKernel(fileName, kernelName,
+    kernelName = "bnsSurface" + suffix;
+    surfaceKernel = platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
 
-    sprintf(kernelName, "bnsPmlSurface%s", suffix);
-    bns->pmlSurfaceKernel = platform.buildKernel(fileName, kernelName,
+    kernelName = "bnsPmlSurface" + suffix;
+    pmlSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   }
 
   // vorticity calculation
-  sprintf(fileName, DBNS "/okl/bnsVorticity%s.okl", suffix);
-  sprintf(kernelName, "bnsVorticity%s", suffix);
+  fileName   = oklFilePrefix + "bnsVorticity" + suffix + oklFileSuffix;
+  kernelName = "bnsVorticity" + suffix;
 
-  bns->vorticityKernel = platform.buildKernel(fileName, kernelName,
+  vorticityKernel = platform.buildKernel(fileName, kernelName,
                                      kernelInfo);
 
   if (mesh.dim==2) {
-    sprintf(fileName, DBNS "/okl/bnsInitialCondition2D.okl");
-    sprintf(kernelName, "bnsInitialCondition2D");
+    fileName   = oklFilePrefix + "bnsInitialCondition2D" + oklFileSuffix;
+    kernelName = "bnsInitialCondition2D";
   } else {
-    sprintf(fileName, DBNS "/okl/bnsInitialCondition3D.okl");
-    sprintf(kernelName, "bnsInitialCondition3D");
+    fileName   = oklFilePrefix + "bnsInitialCondition3D" + oklFileSuffix;
+    kernelName = "bnsInitialCondition3D";
   }
 
-  bns->initialConditionKernel = platform.buildKernel(fileName, kernelName,
+  initialConditionKernel = platform.buildKernel(fileName, kernelName,
                                             kernelInfo);
-
-  return *bns;
 }
-
-bns_t::~bns_t() {
-  volumeKernel.free();
-  surfaceKernel.free();
-  relaxationKernel.free();
-  pmlVolumeKernel.free();
-  pmlSurfaceKernel.free();
-  pmlRelaxationKernel.free();
-  vorticityKernel.free();
-  initialConditionKernel.free();
-
-  if (timeStepper) delete timeStepper;
-  if (traceHalo) traceHalo->Free();
-
-  for (int lev=0;lev<mesh.mrNlevels;lev++)
-    if (multirateTraceHalo[lev]) multirateTraceHalo[lev]->Free();
-}
\ No newline at end of file
diff --git a/solvers/bns/src/bnsStep.cpp b/solvers/bns/src/bnsStep.cpp
index 40a6fa611..cfd9eb34c 100644
--- a/solvers/bns/src/bnsStep.cpp
+++ b/solvers/bns/src/bnsStep.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -32,11 +32,11 @@ dfloat bns_t::MaxWaveSpeed(){
 }
 
 //evaluate ODE rhs = f(q,t)
-void bns_t::rhsf_pml(occa::memory& o_Q, occa::memory& o_pmlQ,
-                     occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T){
+void bns_t::rhsf_pml(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                     deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS, const dfloat T){
 
   // extract q trace halo and start exchange
-  traceHalo->ExchangeStart(o_Q, 1, ogs_dfloat);
+  traceHalo.ExchangeStart(o_Q, 1);
 
   // compute volume contribution to bns RHS
   rhsVolume(mesh.NnonPmlElements, mesh.o_nonPmlElements, o_Q, o_RHS, T);
@@ -49,7 +49,7 @@ void bns_t::rhsf_pml(occa::memory& o_Q, occa::memory& o_pmlQ,
                    o_Q, o_pmlQ, o_RHS, o_pmlRHS);
 
   // complete trace halo exchange
-  traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat);
+  traceHalo.ExchangeFinish(o_Q, 1);
 
   // compute surface contribution to bns RHS
   rhsSurface(mesh.NnonPmlElements, mesh.o_nonPmlElements, o_Q, o_RHS, T);
@@ -59,12 +59,12 @@ void bns_t::rhsf_pml(occa::memory& o_Q, occa::memory& o_pmlQ,
 
 
 //evaluate ODE rhs = f(q,t)
-void bns_t::rhsf_MR_pml(occa::memory& o_Q, occa::memory& o_pmlQ,
-                        occa::memory& o_RHS, occa::memory& o_pmlRHS,
-                        occa::memory& o_fQM, const dfloat T, const int lev){
+void bns_t::rhsf_MR_pml(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                        deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS,
+                        deviceMemory<dfloat>& o_fQM, const dfloat T, const int lev){
 
   // extract q trace halo and start exchange
-  multirateTraceHalo[lev]->ExchangeStart(o_fQM, 1, ogs_dfloat);
+  multirateTraceHalo[lev].ExchangeStart(o_fQM, 1);
 
   // compute volume contribution to bns RHS
   rhsVolume(mesh.mrNnonPmlElements[lev], mesh.o_mrNonPmlElements[lev], o_Q, o_RHS, T);
@@ -77,7 +77,7 @@ void bns_t::rhsf_MR_pml(occa::memory& o_Q, occa::memory& o_pmlQ,
                    o_Q, o_pmlQ, o_RHS, o_pmlRHS);
 
   // complete trace halo exchange
-  multirateTraceHalo[lev]->ExchangeFinish(o_fQM, 1, ogs_dfloat);
+  multirateTraceHalo[lev].ExchangeFinish(o_fQM, 1);
 
   // compute surface contribution to bns RHS
   rhsSurfaceMR(mesh.mrNnonPmlElements[lev], mesh.o_mrNonPmlElements[lev], o_Q, o_RHS, o_fQM, T);
@@ -85,8 +85,8 @@ void bns_t::rhsf_MR_pml(occa::memory& o_Q, occa::memory& o_pmlQ,
                   o_Q, o_pmlQ, o_RHS, o_pmlRHS, o_fQM, T);
 }
 
-void bns_t::rhsVolume(dlong N, occa::memory& o_ids,
-                      occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void bns_t::rhsVolume(dlong N, deviceMemory<dlong>& o_ids,
+                      deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){
 
   // compute volume contribution to bns RHS
   if (N)
@@ -104,9 +104,9 @@ void bns_t::rhsVolume(dlong N, occa::memory& o_ids,
                  o_RHS);
 }
 
-void bns_t::rhsPmlVolume(dlong N, occa::memory& o_ids, occa::memory& o_pmlids,
-                         occa::memory& o_Q, occa::memory& o_pmlQ,
-                         occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T){
+void bns_t::rhsPmlVolume(dlong N, deviceMemory<dlong>& o_ids, deviceMemory<dlong>& o_pmlids,
+                         deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                         deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS, const dfloat T){
 
   // compute volume contribution to bns RHS
   if (N) {
@@ -146,8 +146,8 @@ void bns_t::rhsPmlVolume(dlong N, occa::memory& o_ids, occa::memory& o_pmlids,
   }
 }
 
-void bns_t::rhsRelaxation(dlong N, occa::memory& o_ids,
-                          occa::memory& o_Q, occa::memory& o_RHS){
+void bns_t::rhsRelaxation(dlong N, deviceMemory<dlong>& o_ids,
+                          deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS){
 
   // compute volume contribution to bns RHS
   if (N)
@@ -163,9 +163,9 @@ void bns_t::rhsRelaxation(dlong N, occa::memory& o_ids,
                      o_RHS);
 }
 
-void bns_t::rhsPmlRelaxation(dlong N, occa::memory& o_ids, occa::memory& o_pmlids,
-                             occa::memory& o_Q, occa::memory& o_pmlQ,
-                             occa::memory& o_RHS, occa::memory& o_pmlRHS){
+void bns_t::rhsPmlRelaxation(dlong N, deviceMemory<dlong>& o_ids, deviceMemory<dlong>& o_pmlids,
+                             deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                             deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS){
 
   // compute volume contribution to bns RHS
   if (N) {
@@ -199,8 +199,8 @@ void bns_t::rhsPmlRelaxation(dlong N, occa::memory& o_ids, occa::memory& o_pmlid
   }
 }
 
-void bns_t::rhsSurface(dlong N, occa::memory& o_ids,
-                      occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void bns_t::rhsSurface(dlong N, deviceMemory<dlong>& o_ids,
+                      deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){
 
   // compute volume contribution to bns RHS
   if (N)
@@ -221,9 +221,9 @@ void bns_t::rhsSurface(dlong N, occa::memory& o_ids,
                   o_RHS);
 }
 
-void bns_t::rhsPmlSurface(dlong N, occa::memory& o_ids, occa::memory& o_pmlids,
-                         occa::memory& o_Q, occa::memory& o_pmlQ,
-                         occa::memory& o_RHS, occa::memory& o_pmlRHS, const dfloat T){
+void bns_t::rhsPmlSurface(dlong N, deviceMemory<dlong>& o_ids, deviceMemory<dlong>& o_pmlids,
+                         deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                         deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS, const dfloat T){
 
   // compute volume contribution to bns RHS
   if (N)
@@ -246,9 +246,9 @@ void bns_t::rhsPmlSurface(dlong N, occa::memory& o_ids, occa::memory& o_pmlids,
                     o_pmlRHS);
 }
 
-void bns_t::rhsSurfaceMR(dlong N, occa::memory& o_ids,
-                         occa::memory& o_Q, occa::memory& o_RHS,
-                         occa::memory& o_fQM, const dfloat T){
+void bns_t::rhsSurfaceMR(dlong N, deviceMemory<dlong>& o_ids,
+                         deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS,
+                         deviceMemory<dfloat>& o_fQM, const dfloat T){
 
   // compute volume contribution to bns RHS
   if (N)
@@ -270,10 +270,10 @@ void bns_t::rhsSurfaceMR(dlong N, occa::memory& o_ids,
                   o_RHS);
 }
 
-void bns_t::rhsPmlSurfaceMR(dlong N, occa::memory& o_ids, occa::memory& o_pmlids,
-                         occa::memory& o_Q, occa::memory& o_pmlQ,
-                         occa::memory& o_RHS, occa::memory& o_pmlRHS,
-                         occa::memory& o_fQM, const dfloat T){
+void bns_t::rhsPmlSurfaceMR(dlong N, deviceMemory<dlong>& o_ids, deviceMemory<dlong>& o_pmlids,
+                         deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_pmlQ,
+                         deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_pmlRHS,
+                         deviceMemory<dfloat>& o_fQM, const dfloat T){
 
   // compute volume contribution to bns RHS
   if (N)
diff --git a/solvers/cns/cns.hpp b/solvers/cns/cns.hpp
index 92c949faf..ae54e9a3f 100644
--- a/solvers/cns/cns.hpp
+++ b/solvers/cns/cns.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -36,18 +36,20 @@ SOFTWARE.
 
 #define DCNS LIBP_DIR"/solvers/cns/"
 
+using namespace libp;
+
 class cnsSettings_t: public settings_t {
 public:
-  cnsSettings_t(MPI_Comm& _comm);
+  cnsSettings_t(comm_t _comm);
   void report();
   void parseFromFile(platformSettings_t& platformSettings,
                      meshSettings_t& meshSettings,
-                     const string filename);
+                     const std::string filename);
 };
 
 class cns_t: public solver_t {
 public:
-  mesh_t &mesh;
+  mesh_t mesh;
 
   int Nfields;
   int Ngrads;
@@ -58,57 +60,56 @@ class cns_t: public solver_t {
   int cubature;
   int isothermal;
 
-  TimeStepper::timeStepper_t* timeStepper;
+  timeStepper_t timeStepper;
 
-  halo_t* fieldTraceHalo;
-  halo_t* gradTraceHalo;
+  ogs::halo_t fieldTraceHalo;
+  ogs::halo_t gradTraceHalo;
 
-  dfloat *q;
-  occa::memory o_q;
+  memory<dfloat> q;
+  deviceMemory<dfloat> o_q;
 
-  dfloat *gradq;
-  occa::memory o_gradq;
+  memory<dfloat> gradq;
+  deviceMemory<dfloat> o_gradq;
 
-  dfloat *Vort;
-  occa::memory o_Vort;
+  memory<dfloat> Vort;
+  deviceMemory<dfloat> o_Vort;
 
-  occa::memory o_Mq;
+  deviceMemory<dfloat> o_Mq;
 
-  occa::kernel volumeKernel;
-  occa::kernel surfaceKernel;
-  occa::kernel cubatureVolumeKernel;
-  occa::kernel cubatureSurfaceKernel;
+  kernel_t volumeKernel;
+  kernel_t surfaceKernel;
+  kernel_t cubatureVolumeKernel;
+  kernel_t cubatureSurfaceKernel;
 
-  occa::kernel gradVolumeKernel;
-  occa::kernel gradSurfaceKernel;
+  kernel_t gradVolumeKernel;
+  kernel_t gradSurfaceKernel;
 
-  occa::kernel vorticityKernel;
+  kernel_t vorticityKernel;
 
-  occa::kernel constrainKernel;
+  kernel_t constrainKernel;
 
-  occa::kernel initialConditionKernel;
-  occa::kernel maxWaveSpeedKernel;
+  kernel_t initialConditionKernel;
+  kernel_t maxWaveSpeedKernel;
 
-  cns_t() = delete;
+  cns_t() = default;
   cns_t(platform_t &_platform, mesh_t &_mesh,
-              cnsSettings_t& _settings):
-    solver_t(_platform, _settings), mesh(_mesh) {}
-
-  ~cns_t();
+              cnsSettings_t& _settings) {
+    Setup(_platform, _mesh, _settings);
+  }
 
   //setup
-  static cns_t& Setup(platform_t& platform, mesh_t& mesh,
-                      cnsSettings_t& settings);
+  void Setup(platform_t& _platform, mesh_t& _mesh,
+             cnsSettings_t& _settings);
 
   void Run();
 
-  void Report(dfloat time, int tstep);
+  void Report(dfloat time, int tstep) override;
 
-  void PlotFields(dfloat* Q, dfloat *V, char *fileName);
+  void PlotFields(memory<dfloat> Q, memory<dfloat> V, std::string fileName);
 
-  void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time);
+  void rhsf(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time);
 
-  dfloat MaxWaveSpeed(occa::memory& o_Q, const dfloat T);
+  dfloat MaxWaveSpeed(deviceMemory<dfloat>& o_Q, const dfloat T);
 };
 
 #endif
diff --git a/solvers/cns/cnsMain.cpp b/solvers/cns/cnsMain.cpp
index 9fc888216..d028762fa 100644
--- a/solvers/cns/cnsMain.cpp
+++ b/solvers/cns/cnsMain.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,39 +29,40 @@ SOFTWARE.
 int main(int argc, char **argv){
 
   // start up MPI
-  MPI_Init(&argc, &argv);
+  Comm::Init(argc, argv);
 
-  MPI_Comm comm = MPI_COMM_WORLD;
+  LIBP_ABORT("Usage: ./cnsMain setupfile", argc!=2);
 
-  if(argc!=2)
-    LIBP_ABORT(string("Usage: ./cnsMain setupfile"));
+  { /*Scope so everything is destructed before MPI_Finalize */
+    comm_t comm(Comm::World().Dup());
 
-  //create default settings
-  platformSettings_t platformSettings(comm);
-  meshSettings_t meshSettings(comm);
-  cnsSettings_t cnsSettings(comm);
+    //create default settings
+    platformSettings_t platformSettings(comm);
+    meshSettings_t meshSettings(comm);
+    cnsSettings_t cnsSettings(comm);
 
-  //load settings from file
-  cnsSettings.parseFromFile(platformSettings, meshSettings,
-                            argv[1]);
+    //load settings from file
+    cnsSettings.parseFromFile(platformSettings, meshSettings,
+                              argv[1]);
 
-  // set up platform
-  platform_t platform(platformSettings);
+    // set up platform
+    platform_t platform(platformSettings);
 
-  platformSettings.report();
-  meshSettings.report();
-  cnsSettings.report();
+    platformSettings.report();
+    meshSettings.report();
+    cnsSettings.report();
 
-  // set up mesh
-  mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm);
+    // set up mesh
+    mesh_t mesh(platform, meshSettings, comm);
 
-  // set up cns solver
-  cns_t& cns = cns_t::Setup(platform, mesh, cnsSettings);
+    // set up cns solver
+    cns_t cns(platform, mesh, cnsSettings);
 
-  // run
-  cns.Run();
+    // run
+    cns.Run();
+  }
 
   // close down MPI
-  MPI_Finalize();
+  Comm::Finalize();
   return LIBP_SUCCESS;
 }
diff --git a/solvers/cns/data/cnsGaussian2D.h b/solvers/cns/data/cnsGaussian2D.h
index 48cf506d1..bc247cfe6 100644
--- a/solvers/cns/data/cnsGaussian2D.h
+++ b/solvers/cns/data/cnsGaussian2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/cns/data/cnsGaussian3D.h b/solvers/cns/data/cnsGaussian3D.h
index 1920900c5..74417fd32 100644
--- a/solvers/cns/data/cnsGaussian3D.h
+++ b/solvers/cns/data/cnsGaussian3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/cns/data/cnsUniform2D.h b/solvers/cns/data/cnsUniform2D.h
index 9123fedee..05a37ec0c 100644
--- a/solvers/cns/data/cnsUniform2D.h
+++ b/solvers/cns/data/cnsUniform2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/cns/data/cnsUniform3D.h b/solvers/cns/data/cnsUniform3D.h
index 0100058f1..15de1588f 100644
--- a/solvers/cns/data/cnsUniform3D.h
+++ b/solvers/cns/data/cnsUniform3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/cns/data/cnsVortexDipole2D.h b/solvers/cns/data/cnsVortexDipole2D.h
index 0e31fdd50..cef94933d 100644
--- a/solvers/cns/data/cnsVortexDipole2D.h
+++ b/solvers/cns/data/cnsVortexDipole2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/cns/makefile b/solvers/cns/makefile
index 09a369c09..3fbe22f42 100644
--- a/solvers/cns/makefile
+++ b/solvers/cns/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -78,11 +78,8 @@ include ../../make.top
 endif
 endif
 
-#gslib
-GS_DIR=${LIBP_TPL_DIR}/gslib
-
 #libraries
-CNS_LIBP_LIBS=timeStepper mesh ogs linAlg core
+CNS_LIBP_LIBS=timeStepper mesh parAdogs ogs linAlg core
 
 #includes
 INCLUDES=${LIBP_INCLUDES} \
@@ -93,11 +90,10 @@ DEFINES =${LIBP_DEFINES} \
          -DLIBP_DIR='"${LIBP_DIR}"'
 
 #.cpp compilation flags
-CNS_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES}
+CNS_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES}
 
 #link libraries
 LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(CNS_LIBP_LIBS)) \
-     -L$(GS_DIR)/lib -lgs \
      ${LIBP_LIBS}
 
 #link flags
@@ -145,10 +141,10 @@ endif
 # rule for .cpp files
 %.o: %.cpp $(DEPS) | libp_libs
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $*.o -c $*.cpp $(CNS_CXXFLAGS)
+	$(LIBP_CXX) -o $*.o -c $*.cpp $(CNS_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $*.o -c $*.cpp $(CNS_CXXFLAGS)
+	@$(LIBP_CXX) -o $*.o -c $*.cpp $(CNS_CXXFLAGS)
 endif
 
 #cleanup
@@ -159,8 +155,7 @@ clean-libs: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} clean
 
 clean-kernels: clean-libs
-# 	$(shell ${OCCA_DIR}/bin/occa clear all -y)
-	rm -rf ~/.occa/
+	rm -rf ${LIBP_DIR}/.occa/
 
 realclean: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} realclean
diff --git a/solvers/cns/okl/cnsConstrainQuad3D.okl b/solvers/cns/okl/cnsConstrainQuad3D.okl
index fa78d6ca5..b99a6dd3b 100644
--- a/solvers/cns/okl/cnsConstrainQuad3D.okl
+++ b/solvers/cns/okl/cnsConstrainQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,34 +26,34 @@ SOFTWARE.
 
 
 @kernel void cnsConstrainQuad3D(const dlong Nelements,
-				@restrict const  dfloat *  x,
-				@restrict const  dfloat *  y,
-				@restrict const  dfloat *  z,
-				@restrict dfloat *  rhsq){
+                                @restrict const  dfloat *  x,
+                                @restrict const  dfloat *  y,
+                                @restrict const  dfloat *  z,
+                                @restrict dfloat *  rhsq){
 
   for(dlong e=0;e<Nelements;++e;@outer(0)){
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
-	
+
         const dlong  base = e*p_Np*p_Nfields + j*p_Nq + i;
 
-	const dfloat xij = x[i+j*p_Nq+e*p_Np];
-	const dfloat yij = y[i+j*p_Nq+e*p_Np];
-	const dfloat zij = z[i+j*p_Nq+e*p_Np];
-
-	dfloat rhsq1 = rhsq[base+1*p_Np];
-	dfloat rhsq2 = rhsq[base+2*p_Np];
-	dfloat rhsq3 = rhsq[base+3*p_Np];
-	
-	const dfloat xdotrhsq = (rhsq1*xij + rhsq2*yij + rhsq3*zij)*p_invRadiusSq;
-	rhsq1 -= xij*xdotrhsq;
-	rhsq2 -= yij*xdotrhsq;
-	rhsq3 -= zij*xdotrhsq;
-	
-	rhsq[base+1*p_Np] = rhsq1;
-	rhsq[base+2*p_Np] = rhsq2;
-	rhsq[base+3*p_Np] = rhsq3;
+        const dfloat xij = x[i+j*p_Nq+e*p_Np];
+        const dfloat yij = y[i+j*p_Nq+e*p_Np];
+        const dfloat zij = z[i+j*p_Nq+e*p_Np];
+
+        dfloat rhsq1 = rhsq[base+1*p_Np];
+        dfloat rhsq2 = rhsq[base+2*p_Np];
+        dfloat rhsq3 = rhsq[base+3*p_Np];
+
+        const dfloat xdotrhsq = (rhsq1*xij + rhsq2*yij + rhsq3*zij)*p_invRadiusSq;
+        rhsq1 -= xij*xdotrhsq;
+        rhsq2 -= yij*xdotrhsq;
+        rhsq3 -= zij*xdotrhsq;
+
+        rhsq[base+1*p_Np] = rhsq1;
+        rhsq[base+2*p_Np] = rhsq2;
+        rhsq[base+3*p_Np] = rhsq3;
       }
     }
   }
diff --git a/solvers/cns/okl/cnsCubatureSurfaceHex3D.okl b/solvers/cns/okl/cnsCubatureSurfaceHex3D.okl
index d50d46b13..068de9e94 100644
--- a/solvers/cns/okl/cnsCubatureSurfaceHex3D.okl
+++ b/solvers/cns/okl/cnsCubatureSurfaceHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -127,8 +127,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
-                                                                        \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
         if (j<p_Nq) {                                                   \
@@ -156,7 +154,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -173,7 +170,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -199,7 +195,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -296,7 +291,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -314,7 +308,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -325,7 +318,6 @@ void upwindRoeAveraged(const dfloat nx,
         }                                                               \
       }                                                                 \
     }                                                                   \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -343,7 +335,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -416,11 +407,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(0) //face 0
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -433,11 +422,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(5) //face 5
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -450,11 +437,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(1) //face 1
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -470,11 +455,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(3) //face 3
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -490,11 +473,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(2) //face 2
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -510,11 +491,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(4) //face 4
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -530,7 +509,6 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
diff --git a/solvers/cns/okl/cnsCubatureSurfaceQuad2D.okl b/solvers/cns/okl/cnsCubatureSurfaceQuad2D.okl
index e692fa7f7..5d18ea341 100644
--- a/solvers/cns/okl/cnsCubatureSurfaceQuad2D.okl
+++ b/solvers/cns/okl/cnsCubatureSurfaceQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -226,8 +226,6 @@ void surfaceTerms(const int face,
       }
     }
 
-    @barrier("local");
-
 
     //interpolate traces, store flux in register
     for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -269,8 +267,6 @@ void surfaceTerms(const int face,
         }
     }
 
-    @barrier("local");
-
     //write fluxes to @shared
     for(int i=0;i<p_cubNq;++i;@inner(0)){
       #pragma unroll p_Nfaces
@@ -342,8 +338,6 @@ void surfaceTerms(const int face,
         }
     }
 
-    @barrier("local");
-
     // for all face nodes of all elements
     // face 0 & 2
     for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -359,8 +353,6 @@ void surfaceTerms(const int face,
       }
     }
 
-    @barrier("local");
-
     // face 1 & 3
     for(int j=0;j<p_cubNq;++j;@inner(0)){
       if(j<p_Nq){
@@ -374,8 +366,6 @@ void surfaceTerms(const int face,
       }
     }
 
-    @barrier("local");
-
     for(int i=0;i<p_cubNq;++i;@inner(0)){
       if(i<p_Nq) {
         #pragma unroll p_Nq
diff --git a/solvers/cns/okl/cnsCubatureSurfaceQuad3D.okl b/solvers/cns/okl/cnsCubatureSurfaceQuad3D.okl
index b34e9328c..b87cfa8ef 100644
--- a/solvers/cns/okl/cnsCubatureSurfaceQuad3D.okl
+++ b/solvers/cns/okl/cnsCubatureSurfaceQuad3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -27,21 +27,21 @@
 
 
 void upwindRoeAveraged(const dfloat nx,
-		       const dfloat ny,
-		       const dfloat nz,
-		       const dfloat rM,
-		       const dfloat ruM,
-		       const dfloat rvM,
-		       const dfloat rwM,       
-		       const dfloat rP,
-		       const dfloat ruP,
-		       const dfloat rvP,
-		       const dfloat rwP,
-		       dfloat *rflux,
-		       dfloat *ruflux,
-		       dfloat *rvflux,
-		       dfloat *rwflux){
-  
+                       const dfloat ny,
+                       const dfloat nz,
+                       const dfloat rM,
+                       const dfloat ruM,
+                       const dfloat rvM,
+                       const dfloat rwM,
+                       const dfloat rP,
+                       const dfloat ruP,
+                       const dfloat rvP,
+                       const dfloat rwP,
+                       dfloat *rflux,
+                       dfloat *ruflux,
+                       dfloat *rvflux,
+                       dfloat *rwflux){
+
   dfloat sqrtrM = sqrt(rM);
   dfloat sqrtrP = sqrt(rP);
 
@@ -124,62 +124,62 @@ void upwindRoeAveraged(const dfloat nx,
 
 
 
-void quadSurfaceTerms(const int face, 
-                      const int m, 
-                      const int i, 
+void quadSurfaceTerms(const int face,
+                      const int m,
+                      const int i,
                       const int j,
                       const dfloat time,
                       dfloat s_cubProjectT[p_cubNq][p_Nq],
                       dfloat s_rflux[p_Nfaces][p_cubNq],
                       dfloat s_ruflux[p_Nfaces][p_cubNq],
                       dfloat s_rvflux[p_Nfaces][p_cubNq],
-		      dfloat s_rwflux[p_Nfaces][p_cubNq],
+                      dfloat s_rwflux[p_Nfaces][p_cubNq],
                       dfloat s_rhsq[p_Nfields][p_Nq][p_Nq]){
-  
-  dfloat r = 0.f, ru = 0.f, rv = 0.f, rw = 0.f;           
 
-  for(int n=0;n<p_cubNq;++n){                 
-    const dfloat Pni = s_cubProjectT[n][m];   
-    r   += Pni*s_rflux [face][n];             
-    ru  += Pni*s_ruflux[face][n];             
+  dfloat r = 0.f, ru = 0.f, rv = 0.f, rw = 0.f;
+
+  for(int n=0;n<p_cubNq;++n){
+    const dfloat Pni = s_cubProjectT[n][m];
+    r   += Pni*s_rflux [face][n];
+    ru  += Pni*s_ruflux[face][n];
     rv  += Pni*s_rvflux[face][n];
-    rw  += Pni*s_rwflux[face][n];             
-  }                                           
+    rw  += Pni*s_rwflux[face][n];
+  }
 
-  s_rhsq[0][j][i] += r;                 
-  s_rhsq[1][j][i] += ru;                        
+  s_rhsq[0][j][i] += r;
+  s_rhsq[1][j][i] += ru;
   s_rhsq[2][j][i] += rv;
-  s_rhsq[3][j][i] += rw;                        
+  s_rhsq[3][j][i] += rw;
 }
 
 // batch process elements
 @kernel void cnsCubatureSurfaceQuad3D_old(const dlong Nelements,
-				      const int advSwitch,
-				      @restrict const  dfloat *  vgeo,
-				      @restrict const  dfloat *  cubsgeo,
-				      @restrict const  dlong  *  vmapM,
-				      @restrict const  dlong  *  vmapP,
-				      @restrict const  int    *  EToB,
-				      @restrict const  dfloat *  cubInterpT,
-				      @restrict const  dfloat *  cubProjectT,
-				      const dfloat time,
-				      @restrict const  dfloat *  intx,
-					  @restrict const  dfloat *  inty,
-				      @restrict const  dfloat *  intz,      
-				      const dfloat mu,
-				      const dfloat intfx,
-				      const dfloat intfy,
-				      const dfloat intfz, 
-				      @restrict const  dfloat *  q,
-				      @restrict const  dfloat *  viscousStresses,
-				      @restrict dfloat *  rhsq){
-  
+                                      const int advSwitch,
+                                      @restrict const  dfloat *  vgeo,
+                                      @restrict const  dfloat *  cubsgeo,
+                                      @restrict const  dlong  *  vmapM,
+                                      @restrict const  dlong  *  vmapP,
+                                      @restrict const  int    *  EToB,
+                                      @restrict const  dfloat *  cubInterpT,
+                                      @restrict const  dfloat *  cubProjectT,
+                                      const dfloat time,
+                                      @restrict const  dfloat *  intx,
+                                      @restrict const  dfloat *  inty,
+                                      @restrict const  dfloat *  intz,
+                                      const dfloat mu,
+                                      const dfloat intfx,
+                                      const dfloat intfy,
+                                      const dfloat intfz,
+                                      @restrict const  dfloat *  q,
+                                      @restrict const  dfloat *  viscousStresses,
+                                      @restrict dfloat *  rhsq){
+
   // for all elements
   for(dlong e=0;e<Nelements;e++;@outer(0)){
-    
+
     // @shared storage for flux terms
     @shared dfloat s_rhsq[p_Nfields][p_Nq][p_Nq];
-    
+
     @shared dfloat s_qM[p_Nfields][p_Nfaces][p_cubNq];
     @shared dfloat s_qP[p_Nfields][p_Nfaces][p_cubNq];
     @shared dfloat s_vSM[p_Nstresses][p_Nfaces][p_cubNq];
@@ -201,192 +201,189 @@ void quadSurfaceTerms(const int face,
       if(i<p_Nq){
 
 #pragma unroll p_Nfaces
-	for (int face=0;face<p_Nfaces;face++) {
-	  const dlong id  = e*p_Nfp*p_Nfaces + face*p_Nq + i;
-	  const dlong idM = vmapM[id];
-	  const dlong idP = vmapP[id];
-
-	  const dlong eM = e;
-	  const dlong eP = idP/p_Np;
-	  const int vidM = idM%p_Np;
-	  const int vidP = idP%p_Np;
-
-	  const dlong qbaseM = eM*p_Np*p_Nfields + vidM;
-	  const dlong qbaseP = eP*p_Np*p_Nfields + vidP;
-
-	  const dlong sbaseM = eM*p_Np*p_Nstresses + vidM;
-	  const dlong sbaseP = eP*p_Np*p_Nstresses + vidP;
-
-	  s_qM[0][face][i] = q[qbaseM + 0*p_Np];
-	  s_qM[1][face][i] = q[qbaseM + 1*p_Np];
-	  s_qM[2][face][i] = q[qbaseM + 2*p_Np];
-	  s_qM[3][face][i] = q[qbaseM + 3*p_Np];
-
-	  s_qP[0][face][i] = q[qbaseP + 0*p_Np];
-	  s_qP[1][face][i] = q[qbaseP + 1*p_Np];
-	  s_qP[2][face][i] = q[qbaseP + 2*p_Np];
-	  s_qP[3][face][i] = q[qbaseP + 3*p_Np];
-
-	  s_vSM[0][face][i] = viscousStresses[sbaseM+0*p_Np];
-	  s_vSM[1][face][i] = viscousStresses[sbaseM+1*p_Np];
-	  s_vSM[2][face][i] = viscousStresses[sbaseM+2*p_Np];
-	  s_vSM[3][face][i] = viscousStresses[sbaseM+3*p_Np];
-	  s_vSM[4][face][i] = viscousStresses[sbaseM+4*p_Np];
-	  s_vSM[5][face][i] = viscousStresses[sbaseM+5*p_Np];
-
-	  s_vSP[0][face][i] = viscousStresses[sbaseP+0*p_Np];
-	  s_vSP[1][face][i] = viscousStresses[sbaseP+1*p_Np];
-	  s_vSP[2][face][i] = viscousStresses[sbaseP+2*p_Np];
-	  s_vSP[3][face][i] = viscousStresses[sbaseP+3*p_Np];
-	  s_vSP[4][face][i] = viscousStresses[sbaseP+4*p_Np];
-	  s_vSP[5][face][i] = viscousStresses[sbaseP+5*p_Np];
-	}
-      
-	//zero out resulting surface contributions
+        for (int face=0;face<p_Nfaces;face++) {
+          const dlong id  = e*p_Nfp*p_Nfaces + face*p_Nq + i;
+          const dlong idM = vmapM[id];
+          const dlong idP = vmapP[id];
+
+          const dlong eM = e;
+          const dlong eP = idP/p_Np;
+          const int vidM = idM%p_Np;
+          const int vidP = idP%p_Np;
+
+          const dlong qbaseM = eM*p_Np*p_Nfields + vidM;
+          const dlong qbaseP = eP*p_Np*p_Nfields + vidP;
+
+          const dlong sbaseM = eM*p_Np*p_Nstresses + vidM;
+          const dlong sbaseP = eP*p_Np*p_Nstresses + vidP;
+
+          s_qM[0][face][i] = q[qbaseM + 0*p_Np];
+          s_qM[1][face][i] = q[qbaseM + 1*p_Np];
+          s_qM[2][face][i] = q[qbaseM + 2*p_Np];
+          s_qM[3][face][i] = q[qbaseM + 3*p_Np];
+
+          s_qP[0][face][i] = q[qbaseP + 0*p_Np];
+          s_qP[1][face][i] = q[qbaseP + 1*p_Np];
+          s_qP[2][face][i] = q[qbaseP + 2*p_Np];
+          s_qP[3][face][i] = q[qbaseP + 3*p_Np];
+
+          s_vSM[0][face][i] = viscousStresses[sbaseM+0*p_Np];
+          s_vSM[1][face][i] = viscousStresses[sbaseM+1*p_Np];
+          s_vSM[2][face][i] = viscousStresses[sbaseM+2*p_Np];
+          s_vSM[3][face][i] = viscousStresses[sbaseM+3*p_Np];
+          s_vSM[4][face][i] = viscousStresses[sbaseM+4*p_Np];
+          s_vSM[5][face][i] = viscousStresses[sbaseM+5*p_Np];
+
+          s_vSP[0][face][i] = viscousStresses[sbaseP+0*p_Np];
+          s_vSP[1][face][i] = viscousStresses[sbaseP+1*p_Np];
+          s_vSP[2][face][i] = viscousStresses[sbaseP+2*p_Np];
+          s_vSP[3][face][i] = viscousStresses[sbaseP+3*p_Np];
+          s_vSP[4][face][i] = viscousStresses[sbaseP+4*p_Np];
+          s_vSP[5][face][i] = viscousStresses[sbaseP+5*p_Np];
+        }
+
+        //zero out resulting surface contributions
 #pragma unroll p_Nq
-	for(int j=0;j<p_Nq;++j){
-	  s_rhsq[0][j][i] = 0.;
-	  s_rhsq[1][j][i] = 0.;
-	  s_rhsq[2][j][i] = 0.;
-	  s_rhsq[3][j][i] = 0.;
-	}
+        for(int j=0;j<p_Nq;++j){
+          s_rhsq[0][j][i] = 0.;
+          s_rhsq[1][j][i] = 0.;
+          s_rhsq[2][j][i] = 0.;
+          s_rhsq[3][j][i] = 0.;
+        }
       }
-    
+
       //fetch reference operators
 #pragma unroll p_Nq
       for(int j=0;j<p_Nq;++j){
-	const int id = i+j*p_cubNq;
-	s_cubInterpT[0][id] = cubInterpT[id];
-	s_cubProjectT[0][id] = cubProjectT[id];
+        const int id = i+j*p_cubNq;
+        s_cubInterpT[0][id] = cubInterpT[id];
+        s_cubProjectT[0][id] = cubProjectT[id];
       }
     }
 
-    @barrier("local");
 
-    //interpolate traces, store flux in register 
+    //interpolate traces, store flux in register
     for(int i=0;i<p_cubNq;++i;@inner(0)){
 
       for (int n=0;n<p_Nfields*p_Nfaces;++n){
-	r_qM[n] = 0.;
-	r_qP[n] = 0.;
+        r_qM[n] = 0.;
+        r_qP[n] = 0.;
       }
 
       for (int n=0;n<p_Nstresses*p_Nfaces;++n){
-	r_vSM[n] = 0.;
-	r_vSP[n] = 0.;
+        r_vSM[n] = 0.;
+        r_vSP[n] = 0.;
       }
 
       //#pragma unroll p_Nq TW: CAUTION THIS UNROLL MAY CAUSE CATASTROPHIC BUG
       for (int n=0;n<p_Nq;n++) {
-	const dfloat Ini = s_cubInterpT[n][i];
+        const dfloat Ini = s_cubInterpT[n][i];
 
 #pragma unroll p_Nfaces
-	for (int face=0;face<p_Nfaces;face++) {
-	  r_qM[0*p_Nfaces+face] += Ini*s_qM[0][face][n];
-	  r_qM[1*p_Nfaces+face] += Ini*s_qM[1][face][n];
-	  r_qM[2*p_Nfaces+face] += Ini*s_qM[2][face][n];
-	  r_qM[3*p_Nfaces+face] += Ini*s_qM[3][face][n];
-
-	  r_qP[0*p_Nfaces+face] += Ini*s_qP[0][face][n];
-	  r_qP[1*p_Nfaces+face] += Ini*s_qP[1][face][n];
-	  r_qP[2*p_Nfaces+face] += Ini*s_qP[2][face][n];
-	  r_qP[3*p_Nfaces+face] += Ini*s_qP[3][face][n];
-	      
-	  r_vSM[0*p_Nfaces+face] += Ini*s_vSM[0][face][n];
-	  r_vSM[1*p_Nfaces+face] += Ini*s_vSM[1][face][n];
-	  r_vSM[2*p_Nfaces+face] += Ini*s_vSM[2][face][n];
-	  r_vSM[3*p_Nfaces+face] += Ini*s_vSM[3][face][n];
-	  r_vSM[4*p_Nfaces+face] += Ini*s_vSM[4][face][n];
-	  r_vSM[5*p_Nfaces+face] += Ini*s_vSM[5][face][n];
-	      
-	  r_vSP[0*p_Nfaces+face] += Ini*s_vSP[0][face][n];
-	  r_vSP[1*p_Nfaces+face] += Ini*s_vSP[1][face][n];
-	  r_vSP[2*p_Nfaces+face] += Ini*s_vSP[2][face][n];
-	  r_vSP[3*p_Nfaces+face] += Ini*s_vSP[3][face][n];
-	  r_vSP[4*p_Nfaces+face] += Ini*s_vSP[4][face][n];
-	  r_vSP[5*p_Nfaces+face] += Ini*s_vSP[5][face][n];
-	}
+        for (int face=0;face<p_Nfaces;face++) {
+          r_qM[0*p_Nfaces+face] += Ini*s_qM[0][face][n];
+          r_qM[1*p_Nfaces+face] += Ini*s_qM[1][face][n];
+          r_qM[2*p_Nfaces+face] += Ini*s_qM[2][face][n];
+          r_qM[3*p_Nfaces+face] += Ini*s_qM[3][face][n];
+
+          r_qP[0*p_Nfaces+face] += Ini*s_qP[0][face][n];
+          r_qP[1*p_Nfaces+face] += Ini*s_qP[1][face][n];
+          r_qP[2*p_Nfaces+face] += Ini*s_qP[2][face][n];
+          r_qP[3*p_Nfaces+face] += Ini*s_qP[3][face][n];
+
+          r_vSM[0*p_Nfaces+face] += Ini*s_vSM[0][face][n];
+          r_vSM[1*p_Nfaces+face] += Ini*s_vSM[1][face][n];
+          r_vSM[2*p_Nfaces+face] += Ini*s_vSM[2][face][n];
+          r_vSM[3*p_Nfaces+face] += Ini*s_vSM[3][face][n];
+          r_vSM[4*p_Nfaces+face] += Ini*s_vSM[4][face][n];
+          r_vSM[5*p_Nfaces+face] += Ini*s_vSM[5][face][n];
+
+          r_vSP[0*p_Nfaces+face] += Ini*s_vSP[0][face][n];
+          r_vSP[1*p_Nfaces+face] += Ini*s_vSP[1][face][n];
+          r_vSP[2*p_Nfaces+face] += Ini*s_vSP[2][face][n];
+          r_vSP[3*p_Nfaces+face] += Ini*s_vSP[3][face][n];
+          r_vSP[4*p_Nfaces+face] += Ini*s_vSP[4][face][n];
+          r_vSP[5*p_Nfaces+face] += Ini*s_vSP[5][face][n];
+        }
       }
     }
-    @barrier("local"); //need a barrier since s_fluxNU and s_fluxNV are aliased
 
     //write fluxes to @shared
     for(int i=0;i<p_cubNq;++i;@inner(0)){
       //#pragma unroll p_Nfaces
       for (int face=0;face<p_Nfaces;face++) {
-	const dlong sk = e*p_cubNq*p_Nfaces + face*p_cubNq + i;
-	
-	const dfloat nx = cubsgeo[sk*p_Nsgeo+p_NXID];
-	const dfloat ny = cubsgeo[sk*p_Nsgeo+p_NYID];
-	const dfloat nz = cubsgeo[sk*p_Nsgeo+p_NZID];
-	const dfloat sJ = cubsgeo[sk*p_Nsgeo+p_SJID];
-	const dfloat hinv = cubsgeo[sk*p_Nsgeo+p_IHID];
-
-	const dfloat rM  = r_qM[0*p_Nfaces+face];
-	const dfloat ruM = r_qM[1*p_Nfaces+face];
-	const dfloat rvM = r_qM[2*p_Nfaces+face];
-	const dfloat rwM = r_qM[3*p_Nfaces+face];
-      
-	dfloat rP  = r_qP[0*p_Nfaces+face];
-	dfloat ruP = r_qP[1*p_Nfaces+face];
-	dfloat rvP = r_qP[2*p_Nfaces+face];
-	dfloat rwP = r_qP[3*p_Nfaces+face];
-
-	const dfloat T11M = r_vSM[0*p_Nfaces+face];
-	const dfloat T12M = r_vSM[1*p_Nfaces+face];
-	const dfloat T13M = r_vSM[2*p_Nfaces+face];
-	const dfloat T22M = r_vSM[3*p_Nfaces+face];
-	const dfloat T23M = r_vSM[4*p_Nfaces+face];
-	const dfloat T33M = r_vSM[5*p_Nfaces+face];
-        
-	const dfloat T11P = r_vSP[0*p_Nfaces+face];
-	const dfloat T12P = r_vSP[1*p_Nfaces+face];
-	const dfloat T13P = r_vSP[2*p_Nfaces+face];
-	const dfloat T22P = r_vSP[3*p_Nfaces+face];
-	const dfloat T23P = r_vSP[4*p_Nfaces+face];
-	const dfloat T33P = r_vSP[5*p_Nfaces+face];
-
-	
+        const dlong sk = e*p_cubNq*p_Nfaces + face*p_cubNq + i;
+
+        const dfloat nx = cubsgeo[sk*p_Nsgeo+p_NXID];
+        const dfloat ny = cubsgeo[sk*p_Nsgeo+p_NYID];
+        const dfloat nz = cubsgeo[sk*p_Nsgeo+p_NZID];
+        const dfloat sJ = cubsgeo[sk*p_Nsgeo+p_SJID];
+        const dfloat hinv = cubsgeo[sk*p_Nsgeo+p_IHID];
+
+        const dfloat rM  = r_qM[0*p_Nfaces+face];
+        const dfloat ruM = r_qM[1*p_Nfaces+face];
+        const dfloat rvM = r_qM[2*p_Nfaces+face];
+        const dfloat rwM = r_qM[3*p_Nfaces+face];
+
+        dfloat rP  = r_qP[0*p_Nfaces+face];
+        dfloat ruP = r_qP[1*p_Nfaces+face];
+        dfloat rvP = r_qP[2*p_Nfaces+face];
+        dfloat rwP = r_qP[3*p_Nfaces+face];
+
+        const dfloat T11M = r_vSM[0*p_Nfaces+face];
+        const dfloat T12M = r_vSM[1*p_Nfaces+face];
+        const dfloat T13M = r_vSM[2*p_Nfaces+face];
+        const dfloat T22M = r_vSM[3*p_Nfaces+face];
+        const dfloat T23M = r_vSM[4*p_Nfaces+face];
+        const dfloat T33M = r_vSM[5*p_Nfaces+face];
+
+        const dfloat T11P = r_vSP[0*p_Nfaces+face];
+        const dfloat T12P = r_vSP[1*p_Nfaces+face];
+        const dfloat T13P = r_vSP[2*p_Nfaces+face];
+        const dfloat T22P = r_vSP[3*p_Nfaces+face];
+        const dfloat T23P = r_vSP[4*p_Nfaces+face];
+        const dfloat T33P = r_vSP[5*p_Nfaces+face];
+
+
 #if 0
-	const dfloat uM = ruM/rM;
-	const dfloat vM = rvM/rM;
-	const dfloat wM = rwM/rM;
-	const dfloat pM = p_RT*rM;
-
-	dfloat uP = ruP/rP;
-	dfloat vP = rvP/rP;
-	dfloat wP = rwP/rP;
-	dfloat pP = p_RT*rP;
-#endif	  
-	dfloat rflux, ruflux, rvflux, rwflux;
-	upwindRoeAveraged (nx, ny, nz,
-			   rM, ruM, rvM, rwM,
-			   rP, ruP, rvP, rwP, &rflux, &ruflux, &rvflux, &rwflux);
-
-	rflux *= advSwitch;
-	ruflux *= advSwitch;
-	rvflux *= advSwitch;
-	rwflux *= advSwitch;
-
-	// TW: WATCH OUT FOR THIS
-	ruflux -= p_half*(nx*(T11P+T11M) + ny*(T12P+T12M) + nz*(T13P+T13M));
-	rvflux -= p_half*(nx*(T12P+T12M) + ny*(T22P+T22M) + nz*(T23P+T23M));
-	rwflux -= p_half*(nx*(T13P+T13M) + ny*(T23P+T23M) + nz*(T33P+T33M));
-	
-	const dfloat penalty = mu*hinv*(p_Nq)*(p_Nq-1)*p_half; 
-	ruflux -= penalty*(ruP-ruM);
-	rvflux -= penalty*(rvP-rvM);
-	rwflux -= penalty*(rwP-rwM);
-	
-	s_rflux [face][i] = sJ*(-rflux);
-	s_ruflux[face][i] = sJ*(-ruflux);
-	s_rvflux[face][i] = sJ*(-rvflux);
-	s_rwflux[face][i] = sJ*(-rwflux);
+        const dfloat uM = ruM/rM;
+        const dfloat vM = rvM/rM;
+        const dfloat wM = rwM/rM;
+        const dfloat pM = p_RT*rM;
+
+        dfloat uP = ruP/rP;
+        dfloat vP = rvP/rP;
+        dfloat wP = rwP/rP;
+        dfloat pP = p_RT*rP;
+#endif
+        dfloat rflux, ruflux, rvflux, rwflux;
+        upwindRoeAveraged (nx, ny, nz,
+                           rM, ruM, rvM, rwM,
+                           rP, ruP, rvP, rwP, &rflux, &ruflux, &rvflux, &rwflux);
+
+        rflux *= advSwitch;
+        ruflux *= advSwitch;
+        rvflux *= advSwitch;
+        rwflux *= advSwitch;
+
+        // TW: WATCH OUT FOR THIS
+        ruflux -= p_half*(nx*(T11P+T11M) + ny*(T12P+T12M) + nz*(T13P+T13M));
+        rvflux -= p_half*(nx*(T12P+T12M) + ny*(T22P+T22M) + nz*(T23P+T23M));
+        rwflux -= p_half*(nx*(T13P+T13M) + ny*(T23P+T23M) + nz*(T33P+T33M));
+
+        const dfloat penalty = mu*hinv*(p_Nq)*(p_Nq-1)*p_half;
+        ruflux -= penalty*(ruP-ruM);
+        rvflux -= penalty*(rvP-rvM);
+        rwflux -= penalty*(rwP-rwM);
+
+        s_rflux [face][i] = sJ*(-rflux);
+        s_ruflux[face][i] = sJ*(-ruflux);
+        s_rvflux[face][i] = sJ*(-rvflux);
+        s_rwflux[face][i] = sJ*(-rwflux);
 
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -396,14 +393,13 @@ void quadSurfaceTerms(const int face,
         quadSurfaceTerms(0, i, i, 0,
                          time, s_cubProjectT, s_rflux, s_ruflux, s_rvflux, s_rwflux, s_rhsq);
 
-        
+
         //        quadSurfaceTerms(2,i,i,p_Nq-1);
         quadSurfaceTerms(2, i, i, p_Nq-1,
                          time, s_cubProjectT, s_rflux, s_ruflux, s_rvflux, s_rwflux, s_rhsq);
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_cubNq;++j;@inner(0)){
@@ -411,29 +407,28 @@ void quadSurfaceTerms(const int face,
         //        quadSurfaceTerms(1,j,p_Nq-1,j);
         quadSurfaceTerms(1, j, p_Nq-1, j,
                          time, s_cubProjectT, s_rflux, s_ruflux, s_rvflux, s_rwflux, s_rhsq);
-       
+
         //        quadSurfaceTerms(3,j,0     ,j);
         quadSurfaceTerms(3, j, 0, j,
                          time, s_cubProjectT, s_rflux, s_ruflux, s_rvflux, s_rwflux, s_rhsq);
       }
     }
-  
-    @barrier("local");
-    
+
+
     for(int i=0;i<p_cubNq;++i;@inner(0)){
       if(i<p_Nq) {
 
 #pragma unroll p_Nq
-	for(int j=0;j<p_Nq;++j){
-	  const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
-	  const dfloat invJW = vgeo[gid + p_IJWID*p_Np];
-
-	  const dlong base = e*p_Np*p_Nfields+j*p_Nq+i;
-	  rhsq[base+0*p_Np] += invJW*s_rhsq[0][j][i];
-	  rhsq[base+1*p_Np] += invJW*s_rhsq[1][j][i];
-	  rhsq[base+2*p_Np] += invJW*s_rhsq[2][j][i];
-	  rhsq[base+3*p_Np] += invJW*s_rhsq[3][j][i];
-	}
+        for(int j=0;j<p_Nq;++j){
+          const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
+          const dfloat invJW = vgeo[gid + p_IJWID*p_Np];
+
+          const dlong base = e*p_Np*p_Nfields+j*p_Nq+i;
+          rhsq[base+0*p_Np] += invJW*s_rhsq[0][j][i];
+          rhsq[base+1*p_Np] += invJW*s_rhsq[1][j][i];
+          rhsq[base+2*p_Np] += invJW*s_rhsq[2][j][i];
+          rhsq[base+3*p_Np] += invJW*s_rhsq[3][j][i];
+        }
       }
     }
   }
@@ -443,32 +438,32 @@ void quadSurfaceTerms(const int face,
 
 // batch process elements
 @kernel void cnsCubatureSurfaceQuad3D(const dlong Nelements,
-				      const int advSwitch,
-				      @restrict const  dfloat *  vgeo,
-				      @restrict const  dfloat *  cubsgeo,
-				      @restrict const  dlong  *  vmapM,
-				      @restrict const  dlong  *  vmapP,
-				      @restrict const  int    *  EToB,
-				      @restrict const  dfloat *  cubInterpT,
-				      @restrict const  dfloat *  cubProjectT,
-				      const dfloat time,
-				      @restrict const  dfloat *  intx,
-				      @restrict const  dfloat *  inty,
-				      @restrict const  dfloat *  intz,      
-				      const dfloat mu,
-				      const dfloat intfx,
-				      const dfloat intfy,
-				      const dfloat intfz, 
-				      @restrict const  dfloat *  q,
-				      @restrict const  dfloat *  viscousStresses,
-				      @restrict dfloat *  rhsq){
-  
+                                      const int advSwitch,
+                                      @restrict const  dfloat *  vgeo,
+                                      @restrict const  dfloat *  cubsgeo,
+                                      @restrict const  dlong  *  vmapM,
+                                      @restrict const  dlong  *  vmapP,
+                                      @restrict const  int    *  EToB,
+                                      @restrict const  dfloat *  cubInterpT,
+                                      @restrict const  dfloat *  cubProjectT,
+                                      const dfloat time,
+                                      @restrict const  dfloat *  intx,
+                                      @restrict const  dfloat *  inty,
+                                      @restrict const  dfloat *  intz,
+                                      const dfloat mu,
+                                      const dfloat intfx,
+                                      const dfloat intfy,
+                                      const dfloat intfz,
+                                      @restrict const  dfloat *  q,
+                                      @restrict const  dfloat *  viscousStresses,
+                                      @restrict dfloat *  rhsq){
+
   // for all elements
   for(dlong e=0;e<Nelements;e++;@outer(0)){
-    
+
     // @shared storage for flux terms
     @shared dfloat s_rhsq[p_Nfields][p_Nq][p_Nq];
-    
+
     @shared dfloat s_qM[p_Nfields][p_cubNq];
     @shared dfloat s_qP[p_Nfields][p_cubNq];
     @shared dfloat s_vS[p_Nstresses][p_cubNq];
@@ -490,17 +485,17 @@ void quadSurfaceTerms(const int face,
 
       //zero out resulting surface contributions
       for(int j=0;j<p_Nq;++j){
-	s_rhsq[0][j][i] = 0.;
-	s_rhsq[1][j][i] = 0.;
-	s_rhsq[2][j][i] = 0.;
-	s_rhsq[3][j][i] = 0.;
+        s_rhsq[0][j][i] = 0.;
+        s_rhsq[1][j][i] = 0.;
+        s_rhsq[2][j][i] = 0.;
+        s_rhsq[3][j][i] = 0.;
       }
-      
+
       //fetch reference operators
       for(int j=0;j<p_Nq;++j){
-	const int id = i+j*p_cubNq;
-	s_cubInterpT[0][id] = cubInterpT[id];
-	s_cubProjectT[0][id] = cubProjectT[id];
+        const int id = i+j*p_cubNq;
+        s_cubInterpT[0][id] = cubInterpT[id];
+        s_cubProjectT[0][id] = cubProjectT[id];
       }
     }
 
@@ -508,130 +503,127 @@ void quadSurfaceTerms(const int face,
     //#pragma unroll p_Nfaces
     for (int face=0;face<p_Nfaces;face++) {
 
-      @barrier("local");
 
       for(int i=0;i<p_cubNq;++i;@inner(0)){
-	if(i<p_Nq){
-	  const dlong id  = e*p_Nfp*p_Nfaces + face*p_Nq + i;
-	  const dlong idM = vmapM[id];
-	  const dlong idP = vmapP[id];
-	  
-	  const dlong eM = e;
-	  const dlong eP = idP/p_Np;
-	  const int vidM = idM%p_Np;
-	  const int vidP = idP%p_Np;
-	  
-	  const dlong qbaseM = eM*p_Np*p_Nfields + vidM;
-	  const dlong qbaseP = eP*p_Np*p_Nfields + vidP;
-	  
-	  const dlong sbaseM = eM*p_Np*p_Nstresses + vidM;
-	  const dlong sbaseP = eP*p_Np*p_Nstresses + vidP;
-	  
-	  s_qM[0][i] = q[qbaseM + 0*p_Np];
-	  s_qM[1][i] = q[qbaseM + 1*p_Np];
-	  s_qM[2][i] = q[qbaseM + 2*p_Np];
-	  s_qM[3][i] = q[qbaseM + 3*p_Np];
-	  
-	  s_qP[0][i] = q[qbaseP + 0*p_Np];
-	  s_qP[1][i] = q[qbaseP + 1*p_Np];
-	  s_qP[2][i] = q[qbaseP + 2*p_Np];
-	  s_qP[3][i] = q[qbaseP + 3*p_Np];
-	  
-	  s_vS[0][i] = viscousStresses[sbaseM+0*p_Np]+viscousStresses[sbaseP+0*p_Np];
-	  s_vS[1][i] = viscousStresses[sbaseM+1*p_Np]+viscousStresses[sbaseP+1*p_Np];
-	  s_vS[2][i] = viscousStresses[sbaseM+2*p_Np]+viscousStresses[sbaseP+2*p_Np];
-	  s_vS[3][i] = viscousStresses[sbaseM+3*p_Np]+viscousStresses[sbaseP+3*p_Np];
-	  s_vS[4][i] = viscousStresses[sbaseM+4*p_Np]+viscousStresses[sbaseP+4*p_Np];
-	  s_vS[5][i] = viscousStresses[sbaseM+5*p_Np]+viscousStresses[sbaseP+5*p_Np];
-	}
+        if(i<p_Nq){
+          const dlong id  = e*p_Nfp*p_Nfaces + face*p_Nq + i;
+          const dlong idM = vmapM[id];
+          const dlong idP = vmapP[id];
+
+          const dlong eM = e;
+          const dlong eP = idP/p_Np;
+          const int vidM = idM%p_Np;
+          const int vidP = idP%p_Np;
+
+          const dlong qbaseM = eM*p_Np*p_Nfields + vidM;
+          const dlong qbaseP = eP*p_Np*p_Nfields + vidP;
+
+          const dlong sbaseM = eM*p_Np*p_Nstresses + vidM;
+          const dlong sbaseP = eP*p_Np*p_Nstresses + vidP;
+
+          s_qM[0][i] = q[qbaseM + 0*p_Np];
+          s_qM[1][i] = q[qbaseM + 1*p_Np];
+          s_qM[2][i] = q[qbaseM + 2*p_Np];
+          s_qM[3][i] = q[qbaseM + 3*p_Np];
+
+          s_qP[0][i] = q[qbaseP + 0*p_Np];
+          s_qP[1][i] = q[qbaseP + 1*p_Np];
+          s_qP[2][i] = q[qbaseP + 2*p_Np];
+          s_qP[3][i] = q[qbaseP + 3*p_Np];
+
+          s_vS[0][i] = viscousStresses[sbaseM+0*p_Np]+viscousStresses[sbaseP+0*p_Np];
+          s_vS[1][i] = viscousStresses[sbaseM+1*p_Np]+viscousStresses[sbaseP+1*p_Np];
+          s_vS[2][i] = viscousStresses[sbaseM+2*p_Np]+viscousStresses[sbaseP+2*p_Np];
+          s_vS[3][i] = viscousStresses[sbaseM+3*p_Np]+viscousStresses[sbaseP+3*p_Np];
+          s_vS[4][i] = viscousStresses[sbaseM+4*p_Np]+viscousStresses[sbaseP+4*p_Np];
+          s_vS[5][i] = viscousStresses[sbaseM+5*p_Np]+viscousStresses[sbaseP+5*p_Np];
+        }
       }
-      
-      @barrier("local");
-      
+
+
       for(int i=0;i<p_cubNq;++i;@inner(0)){
-	
-	for (int n=0;n<p_Nfields;++n){
-	  r_qM[n] = 0.;
-	  r_qP[n] = 0.;
-	}
-	
-	for (int n=0;n<p_Nstresses*p_Nfaces;++n){
-	  r_vS[n] = 0.;
-	}
-	
-#pragma unroll p_Nq 
-	for (int n=0;n<p_Nq;n++) {
-	  const dfloat Ini = s_cubInterpT[n][i];
-	  r_qM[0] += Ini*s_qM[0][n];
-	  r_qM[1] += Ini*s_qM[1][n];
-	  r_qM[2] += Ini*s_qM[2][n];
-	  r_qM[3] += Ini*s_qM[3][n];
-	  
-	  r_qP[0] += Ini*s_qP[0][n];
-	  r_qP[1] += Ini*s_qP[1][n];
-	  r_qP[2] += Ini*s_qP[2][n];
-	  r_qP[3] += Ini*s_qP[3][n];
-	  
-	  r_vS[0] += Ini*s_vS[0][n];
-	  r_vS[1] += Ini*s_vS[1][n];
-	  r_vS[2] += Ini*s_vS[2][n];
-	  r_vS[3] += Ini*s_vS[3][n];
-	  r_vS[4] += Ini*s_vS[4][n];
-	  r_vS[5] += Ini*s_vS[5][n];
-	}
-	
-	const dlong sk = e*p_cubNq*p_Nfaces + face*p_cubNq + i;
-	
-	const dfloat nx = cubsgeo[sk*p_Nsgeo+p_NXID];
-	const dfloat ny = cubsgeo[sk*p_Nsgeo+p_NYID];
-	const dfloat nz = cubsgeo[sk*p_Nsgeo+p_NZID];
-	const dfloat sJ = cubsgeo[sk*p_Nsgeo+p_SJID];
-	const dfloat hinv = cubsgeo[sk*p_Nsgeo+p_IHID];
-	
-	const dfloat rM  = r_qM[0];
-	const dfloat ruM = r_qM[1];
-	const dfloat rvM = r_qM[2];
-	const dfloat rwM = r_qM[3];
-	
-	dfloat rP  = r_qP[0];
-	dfloat ruP = r_qP[1];
-	dfloat rvP = r_qP[2];
-	dfloat rwP = r_qP[3];
-	
-	const dfloat T11 = r_vS[0];
-	const dfloat T12 = r_vS[1];
-	const dfloat T13 = r_vS[2];
-	const dfloat T22 = r_vS[3];
-	const dfloat T23 = r_vS[4];
-	const dfloat T33 = r_vS[5];
-	
-	dfloat rflux, ruflux, rvflux, rwflux;
-	upwindRoeAveraged (nx, ny, nz, rM, ruM, rvM, rwM, rP, ruP, rvP, rwP, &rflux, &ruflux, &rvflux, &rwflux);
-	
-	rflux *= advSwitch;
-	ruflux *= advSwitch;
-	rvflux *= advSwitch;
-	rwflux *= advSwitch;
-	
-	// TW: WATCH OUT FOR THIS
-	ruflux -= p_half*(nx*(T11) + ny*(T12) + nz*(T13));
-	rvflux -= p_half*(nx*(T12) + ny*(T22) + nz*(T23));
-	rwflux -= p_half*(nx*(T13) + ny*(T23) + nz*(T33));
-	
-	const dfloat penalty = mu*hinv*(p_Nq)*(p_Nq-1)*p_half; 
-	ruflux -= penalty*(ruP-ruM);
-	rvflux -= penalty*(rvP-rvM);
-	rwflux -= penalty*(rwP-rwM);
-	
-	s_rflux [face][i] = sJ*(-rflux);
-	s_ruflux[face][i] = sJ*(-ruflux);
-	s_rvflux[face][i] = sJ*(-rvflux);
-	s_rwflux[face][i] = sJ*(-rwflux);
-	
+
+        for (int n=0;n<p_Nfields;++n){
+          r_qM[n] = 0.;
+          r_qP[n] = 0.;
+        }
+
+        for (int n=0;n<p_Nstresses*p_Nfaces;++n){
+          r_vS[n] = 0.;
+        }
+
+#pragma unroll p_Nq
+        for (int n=0;n<p_Nq;n++) {
+          const dfloat Ini = s_cubInterpT[n][i];
+          r_qM[0] += Ini*s_qM[0][n];
+          r_qM[1] += Ini*s_qM[1][n];
+          r_qM[2] += Ini*s_qM[2][n];
+          r_qM[3] += Ini*s_qM[3][n];
+
+          r_qP[0] += Ini*s_qP[0][n];
+          r_qP[1] += Ini*s_qP[1][n];
+          r_qP[2] += Ini*s_qP[2][n];
+          r_qP[3] += Ini*s_qP[3][n];
+
+          r_vS[0] += Ini*s_vS[0][n];
+          r_vS[1] += Ini*s_vS[1][n];
+          r_vS[2] += Ini*s_vS[2][n];
+          r_vS[3] += Ini*s_vS[3][n];
+          r_vS[4] += Ini*s_vS[4][n];
+          r_vS[5] += Ini*s_vS[5][n];
+        }
+
+        const dlong sk = e*p_cubNq*p_Nfaces + face*p_cubNq + i;
+
+        const dfloat nx = cubsgeo[sk*p_Nsgeo+p_NXID];
+        const dfloat ny = cubsgeo[sk*p_Nsgeo+p_NYID];
+        const dfloat nz = cubsgeo[sk*p_Nsgeo+p_NZID];
+        const dfloat sJ = cubsgeo[sk*p_Nsgeo+p_SJID];
+        const dfloat hinv = cubsgeo[sk*p_Nsgeo+p_IHID];
+
+        const dfloat rM  = r_qM[0];
+        const dfloat ruM = r_qM[1];
+        const dfloat rvM = r_qM[2];
+        const dfloat rwM = r_qM[3];
+
+        dfloat rP  = r_qP[0];
+        dfloat ruP = r_qP[1];
+        dfloat rvP = r_qP[2];
+        dfloat rwP = r_qP[3];
+
+        const dfloat T11 = r_vS[0];
+        const dfloat T12 = r_vS[1];
+        const dfloat T13 = r_vS[2];
+        const dfloat T22 = r_vS[3];
+        const dfloat T23 = r_vS[4];
+        const dfloat T33 = r_vS[5];
+
+        dfloat rflux, ruflux, rvflux, rwflux;
+        upwindRoeAveraged (nx, ny, nz, rM, ruM, rvM, rwM, rP, ruP, rvP, rwP, &rflux, &ruflux, &rvflux, &rwflux);
+
+        rflux *= advSwitch;
+        ruflux *= advSwitch;
+        rvflux *= advSwitch;
+        rwflux *= advSwitch;
+
+        // TW: WATCH OUT FOR THIS
+        ruflux -= p_half*(nx*(T11) + ny*(T12) + nz*(T13));
+        rvflux -= p_half*(nx*(T12) + ny*(T22) + nz*(T23));
+        rwflux -= p_half*(nx*(T13) + ny*(T23) + nz*(T33));
+
+        const dfloat penalty = mu*hinv*(p_Nq)*(p_Nq-1)*p_half;
+        ruflux -= penalty*(ruP-ruM);
+        rvflux -= penalty*(rvP-rvM);
+        rwflux -= penalty*(rwP-rwM);
+
+        s_rflux [face][i] = sJ*(-rflux);
+        s_ruflux[face][i] = sJ*(-ruflux);
+        s_rvflux[face][i] = sJ*(-rvflux);
+        s_rwflux[face][i] = sJ*(-rwflux);
+
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -651,7 +643,6 @@ void quadSurfaceTerms(const int face,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_cubNq;++j;@inner(0)){
@@ -669,23 +660,22 @@ void quadSurfaceTerms(const int face,
                          time, s_cubProjectT, s_rflux, s_ruflux, s_rvflux, s_rwflux, s_rhsq);
       }
     }
-  
-    @barrier("local");
-    
+
+
     for(int i=0;i<p_cubNq;++i;@inner(0)){
       if(i<p_Nq) {
 
-	//#pragma unroll p_Nq
-	for(int j=0;j<p_Nq;++j){
-	  const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
-	  const dfloat invJW = vgeo[gid + p_IJWID*p_Np];
-
-	  const dlong base = e*p_Np*p_Nfields+j*p_Nq+i;
-	  rhsq[base+0*p_Np] += invJW*s_rhsq[0][j][i];
-	  rhsq[base+1*p_Np] += invJW*s_rhsq[1][j][i];
-	  rhsq[base+2*p_Np] += invJW*s_rhsq[2][j][i];
-	  rhsq[base+3*p_Np] += invJW*s_rhsq[3][j][i];
-	}
+        //#pragma unroll p_Nq
+        for(int j=0;j<p_Nq;++j){
+          const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
+          const dfloat invJW = vgeo[gid + p_IJWID*p_Np];
+
+          const dlong base = e*p_Np*p_Nfields+j*p_Nq+i;
+          rhsq[base+0*p_Np] += invJW*s_rhsq[0][j][i];
+          rhsq[base+1*p_Np] += invJW*s_rhsq[1][j][i];
+          rhsq[base+2*p_Np] += invJW*s_rhsq[2][j][i];
+          rhsq[base+3*p_Np] += invJW*s_rhsq[3][j][i];
+        }
       }
     }
   }
diff --git a/solvers/cns/okl/cnsCubatureSurfaceTet3D.okl b/solvers/cns/okl/cnsCubatureSurfaceTet3D.okl
index 839cb3abf..de4b49bfa 100644
--- a/solvers/cns/okl/cnsCubatureSurfaceTet3D.okl
+++ b/solvers/cns/okl/cnsCubatureSurfaceTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -197,7 +197,6 @@ void upwindRoeAveraged(const dfloat nx,
           }
         }
 
-        @barrier("local");
 
         // interpolate to surface integration nodes
         for(int n=0;n<p_cubMaxNodes1;++n;@inner(0)){ // maxNodes = max(Nfp*Nfaces,Np)
@@ -324,7 +323,6 @@ void upwindRoeAveraged(const dfloat nx,
         }
 
         // wait for all @shared memory writes of the previous inner loop to complete
-        @barrier("local");
 
         // for each node in the element
         for(int n=0;n<p_cubMaxNodes1;++n;@inner(0)){
@@ -341,10 +339,8 @@ void upwindRoeAveraged(const dfloat nx,
               }
           }
         }
-        @barrier("local");
       }
 
-    @barrier("local");
 
     // for each node in the element
     for(int n=0;n<p_cubMaxNodes1;++n;@inner(0)){
@@ -358,4 +354,4 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/cns/okl/cnsCubatureSurfaceTri2D.okl b/solvers/cns/okl/cnsCubatureSurfaceTri2D.okl
index 78624e807..37df6555a 100644
--- a/solvers/cns/okl/cnsCubatureSurfaceTri2D.okl
+++ b/solvers/cns/okl/cnsCubatureSurfaceTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -165,7 +165,6 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int n=0;n<p_cubMaxNodes;++n;@inner(0)){ // maxNodes = max(Nfp*Nfaces,Np)
@@ -258,7 +257,6 @@ void upwindRoeAveraged(const dfloat nx,
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int n=0;n<p_cubMaxNodes;++n;@inner(0)){
diff --git a/solvers/cns/okl/cnsCubatureVolumeHex3D.okl b/solvers/cns/okl/cnsCubatureVolumeHex3D.okl
index 975e4a1f7..6b5e1de99 100644
--- a/solvers/cns/okl/cnsCubatureVolumeHex3D.okl
+++ b/solvers/cns/okl/cnsCubatureVolumeHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -91,7 +91,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //read in and interpolate in k
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -138,7 +137,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_cubNq
     for(int k=0;k<p_cubNq;++k){
@@ -156,7 +154,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -182,7 +179,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -197,7 +193,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -221,7 +216,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     //#pragma unroll p_cubNq
@@ -325,7 +319,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -344,7 +337,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     } //k loop
 
     #pragma unroll p_cubNq
@@ -358,7 +350,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -378,7 +369,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -390,7 +380,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -410,7 +399,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     //project in k and write out
diff --git a/solvers/cns/okl/cnsCubatureVolumeQuad2D.okl b/solvers/cns/okl/cnsCubatureVolumeQuad2D.okl
index cf883d213..67c99f940 100644
--- a/solvers/cns/okl/cnsCubatureVolumeQuad2D.okl
+++ b/solvers/cns/okl/cnsCubatureVolumeQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -100,7 +100,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -126,7 +125,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -144,7 +142,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in j and store flux in register
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -168,7 +165,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -233,7 +229,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in j
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -259,7 +254,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -277,7 +271,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in i and write back
     for(int j=0;j<p_cubNq;++j;@inner(1)){
diff --git a/solvers/cns/okl/cnsCubatureVolumeQuad3D.okl b/solvers/cns/okl/cnsCubatureVolumeQuad3D.okl
index d329211e2..cef6a2b37 100644
--- a/solvers/cns/okl/cnsCubatureVolumeQuad3D.okl
+++ b/solvers/cns/okl/cnsCubatureVolumeQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,22 +27,22 @@ SOFTWARE.
 
 // isothermal Compressible Navier-Stokes
 @kernel void cnsCubatureVolumeQuad3D(const dlong Nelements,
-				     const int advSwitch,
-				     const dfloat fx,
-				     const dfloat fy,
-				     const dfloat fz, 
-				     @restrict const  dfloat *  vgeo,
-				     @restrict const  dfloat *  x,
-				     @restrict const  dfloat *  y,
-				     @restrict const  dfloat *  z,
-				     @restrict const  dfloat *  cubvgeo,
-				     @restrict const  dfloat *  cubDWT,
-				     @restrict const  dfloat *  cubInterpT,
-				     @restrict const  dfloat *  cubProjectT,
-				     @restrict const  dfloat *  viscousStresses,
-				     @restrict const  dfloat *  q,
-				     @restrict dfloat *  rhsq){
-  
+                                     const int advSwitch,
+                                     const dfloat fx,
+                                     const dfloat fy,
+                                     const dfloat fz,
+                                     @restrict const  dfloat *  vgeo,
+                                     @restrict const  dfloat *  x,
+                                     @restrict const  dfloat *  y,
+                                     @restrict const  dfloat *  z,
+                                     @restrict const  dfloat *  cubvgeo,
+                                     @restrict const  dfloat *  cubDWT,
+                                     @restrict const  dfloat *  cubInterpT,
+                                     @restrict const  dfloat *  cubProjectT,
+                                     @restrict const  dfloat *  viscousStresses,
+                                     @restrict const  dfloat *  q,
+                                     @restrict dfloat *  rhsq){
+
   for(dlong e=0;e<Nelements;++e;@outer(0)){
 
     @shared dfloat s_F[p_Nfields][p_cubNq][p_cubNq];
@@ -66,22 +66,22 @@ SOFTWARE.
     @exclusive dfloat r_q[p_Nfields], r_vS[p_Nstresses];
     @exclusive dfloat r_F[p_Nfields], r_G[p_Nfields], r_H[p_Nfields];
 
-    for(int j=0;j<p_cubNq;++j;@inner(1)){ 
-      for(int i=0;i<p_cubNq;++i;@inner(0)){    
-        if((i<p_Nq) && (j<p_Nq)){ 
+    for(int j=0;j<p_cubNq;++j;@inner(1)){
+      for(int i=0;i<p_cubNq;++i;@inner(0)){
+        if((i<p_Nq) && (j<p_Nq)){
           // conserved variables
           const dlong  qbase = e*p_Np*p_Nfields + j*p_Nq + i;
           s_q[0][j][i] = q[qbase+0*p_Np];
           s_q[1][j][i] = q[qbase+1*p_Np];
           s_q[2][j][i] = q[qbase+2*p_Np];
-	  s_q[3][j][i] = q[qbase+3*p_Np];
-          
+          s_q[3][j][i] = q[qbase+3*p_Np];
+
           // viscous stresses (precomputed by cnsStressesVolumeQuad2D)
           const dlong id = e*p_Np*p_Nstresses + j*p_Nq + i;
           s_S11[j][i] = viscousStresses[id+0*p_Np];
           s_S12[j][i] = viscousStresses[id+1*p_Np];
           s_S13[j][i] = viscousStresses[id+2*p_Np];
-	  s_S22[j][i] = viscousStresses[id+3*p_Np];
+          s_S22[j][i] = viscousStresses[id+3*p_Np];
           s_S23[j][i] = viscousStresses[id+4*p_Np];
           s_S33[j][i] = viscousStresses[id+5*p_Np];
         }
@@ -95,15 +95,14 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
-    for(int j=0;j<p_cubNq;++j;@inner(1)){ 
+    for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
         if (j<p_Nq) {
           r_q[0] = 0.; r_q[1] = 0.; r_q[2] = 0.; r_q[3] = 0.;
           r_vS[0] = 0.; r_vS[1] = 0.; r_vS[2] = 0.;
-	  r_vS[3] = 0.; r_vS[4] = 0.; r_vS[5] = 0.;
+          r_vS[3] = 0.; r_vS[4] = 0.; r_vS[5] = 0.;
 
           #pragma unroll p_Nq
           for (int n=0;n<p_Nq;n++) {
@@ -111,7 +110,7 @@ SOFTWARE.
             r_q[0] += Ini*s_q[0][j][n];
             r_q[1] += Ini*s_q[1][j][n];
             r_q[2] += Ini*s_q[2][j][n];
-	    r_q[3] += Ini*s_q[3][j][n];
+            r_q[3] += Ini*s_q[3][j][n];
             r_vS[0] += Ini*s_S11[j][n];
             r_vS[1] += Ini*s_S12[j][n];
             r_vS[2] += Ini*s_S13[j][n];
@@ -123,34 +122,32 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
-    for(int j=0;j<p_cubNq;++j;@inner(1)){ 
+    for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
         if (j<p_Nq) {
           s_q[0][j][i] = r_q[0];
           s_q[1][j][i] = r_q[1];
           s_q[2][j][i] = r_q[2];
-	  s_q[3][j][i] = r_q[3];
+          s_q[3][j][i] = r_q[3];
           s_S11[j][i] = r_vS[0];
           s_S12[j][i] = r_vS[1];
           s_S13[j][i] = r_vS[2];
-	  s_S22[j][i] = r_vS[3];
+          s_S22[j][i] = r_vS[3];
           s_S23[j][i] = r_vS[4];
           s_S33[j][i] = r_vS[5];
         }
       }
     }
 
-    @barrier("local");
 
     //interpolate in j and store flux in register
-    for(int j=0;j<p_cubNq;++j;@inner(1)){ 
+    for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
         r_q[0] = 0.; r_q[1] = 0.; r_q[2] = 0.; r_q[3] = 0.;
         r_vS[0] = 0.; r_vS[1] = 0.; r_vS[2] = 0.;
-	r_vS[3] = 0.; r_vS[4] = 0.; r_vS[5] = 0.;
+        r_vS[3] = 0.; r_vS[4] = 0.; r_vS[5] = 0.;
 
         #pragma unroll p_Nq
         for (int n=0;n<p_Nq;n++) {
@@ -158,18 +155,17 @@ SOFTWARE.
           r_q[0] += Inj*s_q[0][n][i];
           r_q[1] += Inj*s_q[1][n][i];
           r_q[2] += Inj*s_q[2][n][i];
-	  r_q[3] += Inj*s_q[3][n][i];
+          r_q[3] += Inj*s_q[3][n][i];
           r_vS[0] += Inj*s_S11[n][i];
           r_vS[1] += Inj*s_S12[n][i];
           r_vS[2] += Inj*s_S13[n][i];
-	  r_vS[3] += Inj*s_S22[n][i];
+          r_vS[3] += Inj*s_S22[n][i];
           r_vS[4] += Inj*s_S23[n][i];
           r_vS[5] += Inj*s_S33[n][i];
         }
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -177,19 +173,19 @@ SOFTWARE.
         const dlong gid = e*p_cubNp*p_Nvgeo+ j*p_cubNq +i;
         const dfloat rx = cubvgeo[gid + p_RXID*p_cubNp];
         const dfloat ry = cubvgeo[gid + p_RYID*p_cubNp];
-	const dfloat rz = cubvgeo[gid + p_RZID*p_cubNp];
+        const dfloat rz = cubvgeo[gid + p_RZID*p_cubNp];
         const dfloat sx = cubvgeo[gid + p_SXID*p_cubNp];
         const dfloat sy = cubvgeo[gid + p_SYID*p_cubNp];
-	const dfloat sz = cubvgeo[gid + p_SZID*p_cubNp];
-	const dfloat tx = cubvgeo[gid + p_TXID*p_cubNp];
+        const dfloat sz = cubvgeo[gid + p_SZID*p_cubNp];
+        const dfloat tx = cubvgeo[gid + p_TXID*p_cubNp];
         const dfloat ty = cubvgeo[gid + p_TYID*p_cubNp];
-	const dfloat tz = cubvgeo[gid + p_TZID*p_cubNp];
+        const dfloat tz = cubvgeo[gid + p_TZID*p_cubNp];
         const dfloat J  = cubvgeo[gid + p_JID*p_cubNp];
 
         const dfloat r  = r_q[0];
         const dfloat ru = r_q[1];
         const dfloat rv = r_q[2];
-	const dfloat rw = r_q[3];
+        const dfloat rw = r_q[3];
         const dfloat p  = r*p_RT;
 
         // primitive variables (velocity)
@@ -197,10 +193,10 @@ SOFTWARE.
 
         const dfloat T11 = r_vS[0];
         const dfloat T12 = r_vS[1];
-	const dfloat T13 = r_vS[2];
+        const dfloat T13 = r_vS[2];
         const dfloat T22 = r_vS[3];
-	const dfloat T23 = r_vS[4];
-	const dfloat T33 = r_vS[5];
+        const dfloat T23 = r_vS[4];
+        const dfloat T33 = r_vS[5];
 
         // (1/J) \hat{div} (G*[F;G])
 
@@ -208,51 +204,50 @@ SOFTWARE.
           // F0 = ru, G0 = rv
           const dfloat f = -advSwitch*ru;
           const dfloat g = -advSwitch*rv;
-	  const dfloat h = -advSwitch*rw;
+          const dfloat h = -advSwitch*rw;
           s_F[0][j][i] = J*(rx*f + ry*g + rz*h);
           s_G[0][j][i] = J*(sx*f + sy*g + sz*h);
-	  s_H[0][j][i] = J*(tx*f + ty*g + tz*h);
+          s_H[0][j][i] = J*(tx*f + ty*g + tz*h);
         }
 
         {
           // F1 = 2*mu*S11 - (ru^2+p), G1 = 2*mu*S12 - (rvu)
           const dfloat f = T11-advSwitch*(ru*u+p);
           const dfloat g = T12-advSwitch*(ru*v);
-	  const dfloat h = T13-advSwitch*(ru*w);
+          const dfloat h = T13-advSwitch*(ru*w);
           s_F[1][j][i] = J*(rx*f + ry*g + rz*h);
           s_G[1][j][i] = J*(sx*f + sy*g + sz*h);
-	  s_H[1][j][i] = J*(tx*f + ty*g + tz*h);
+          s_H[1][j][i] = J*(tx*f + ty*g + tz*h);
         }
 
         {
           // F2 = 2*mu*S21 - (ruv), G2 = 2*mu*S22 - (rv^2+p)
           const dfloat f = T12-advSwitch*(rv*u);
           const dfloat g = T22-advSwitch*(rv*v+p);
-	  const dfloat h = T23-advSwitch*(rv*w);
+          const dfloat h = T23-advSwitch*(rv*w);
           s_F[2][j][i] = J*(rx*f + ry*g + rz*h);
           s_G[2][j][i] = J*(sx*f + sy*g + sz*h);
-	  s_H[2][j][i] = J*(tx*f + ty*g + tz*h);
+          s_H[2][j][i] = J*(tx*f + ty*g + tz*h);
         }
 
-	{
+        {
           const dfloat f = T13-advSwitch*(rw*u);
           const dfloat g = T23-advSwitch*(rw*v);
-	  const dfloat h = T33-advSwitch*(rw*w+p);
+          const dfloat h = T33-advSwitch*(rw*w+p);
           s_F[3][j][i] = J*(rx*f + ry*g + rz*h);
           s_G[3][j][i] = J*(sx*f + sy*g + sz*h);
-	  s_H[3][j][i] = J*(tx*f + ty*g + tz*h);
+          s_H[3][j][i] = J*(tx*f + ty*g + tz*h);
         }
       }
     }
 
-    @barrier("local");
 
-    //project/differentiate in j 
-    for(int j=0;j<p_cubNq;++j;@inner(1)){ 
+    //project/differentiate in j
+    for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
         r_F[0] = 0.; r_F[1] = 0.; r_F[2] = 0.; r_F[3] = 0.;
         r_G[0] = 0.; r_G[1] = 0.; r_G[2] = 0.; r_G[3] = 0.;
-	r_H[0] = 0.; r_H[1] = 0.; r_H[2] = 0.; r_H[3] = 0.;
+        r_H[0] = 0.; r_H[1] = 0.; r_H[2] = 0.; r_H[3] = 0.;
 
         if (j<p_Nq) {
           #pragma unroll p_cubNq
@@ -262,53 +257,51 @@ SOFTWARE.
             r_F[0] += Pnj*s_F[0][n][i];
             r_F[1] += Pnj*s_F[1][n][i];
             r_F[2] += Pnj*s_F[2][n][i];
-	    r_F[3] += Pnj*s_F[3][n][i]; 
+            r_F[3] += Pnj*s_F[3][n][i];
             r_G[0] += Dnj*s_G[0][n][i];
             r_G[1] += Dnj*s_G[1][n][i];
-	    r_G[2] += Dnj*s_G[2][n][i];
-	    r_G[3] += Dnj*s_G[3][n][i];
-	    r_H[0] += Pnj*s_H[0][n][i];
+            r_G[2] += Dnj*s_G[2][n][i];
+            r_G[3] += Dnj*s_G[3][n][i];
+            r_H[0] += Pnj*s_H[0][n][i];
             r_H[1] += Pnj*s_H[1][n][i];
             r_H[2] += Pnj*s_H[2][n][i];
-	    r_H[3] += Pnj*s_H[3][n][i]; 
+            r_H[3] += Pnj*s_H[3][n][i];
           }
         }
       }
     }
 
-    @barrier("local");
 
-    //write register back to @shared 
-    for(int j=0;j<p_cubNq;++j;@inner(1)){ 
+    //write register back to @shared
+    for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
         if (j<p_Nq) {
           s_F[0][j][i] = r_F[0];
           s_F[1][j][i] = r_F[1];
           s_F[2][j][i] = r_F[2];
-	  s_F[3][j][i] = r_F[3];
+          s_F[3][j][i] = r_F[3];
           s_G[0][j][i] = r_G[0];
           s_G[1][j][i] = r_G[1];
           s_G[2][j][i] = r_G[2];
-	  s_G[3][j][i] = r_G[3];
-	  s_H[0][j][i] = r_H[0];
+          s_G[3][j][i] = r_G[3];
+          s_H[0][j][i] = r_H[0];
           s_H[1][j][i] = r_H[1];
           s_H[2][j][i] = r_H[2];
-	  s_H[3][j][i] = r_H[3];
+          s_H[3][j][i] = r_H[3];
         }
       }
     }
 
-    @barrier("local");
 
-    //project/differentiate in i and write back 
-    for(int j=0;j<p_cubNq;++j;@inner(1)){ 
-      for(int i=0;i<p_cubNq;++i;@inner(0)){  
-        if((i<p_Nq) && (j<p_Nq)){ 
+    //project/differentiate in i and write back
+    for(int j=0;j<p_cubNq;++j;@inner(1)){
+      for(int i=0;i<p_cubNq;++i;@inner(0)){
+        if((i<p_Nq) && (j<p_Nq)){
           const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
           const dfloat invJW = vgeo[gid + p_IJWID*p_Np];
-          
+
           dfloat rhsq0 = 0, rhsq1 = 0, rhsq2 = 0, rhsq3 = 0;
-          
+
           #pragma unroll p_cubNq
           for(int n=0;n<p_cubNq;++n){
             const dfloat Pni = s_cubProjectT[n][i];
@@ -317,27 +310,27 @@ SOFTWARE.
             rhsq0 += Dni*s_F[0][j][n] + Pni*s_G[0][j][n] + Pni*s_H[0][j][n];
             rhsq1 += Dni*s_F[1][j][n] + Pni*s_G[1][j][n] + Pni*s_H[1][j][n];
             rhsq2 += Dni*s_F[2][j][n] + Pni*s_G[2][j][n] + Pni*s_H[2][j][n];
-	    rhsq3 += Dni*s_F[3][j][n] + Pni*s_G[3][j][n] + Pni*s_H[3][j][n];
+            rhsq3 += Dni*s_F[3][j][n] + Pni*s_G[3][j][n] + Pni*s_H[3][j][n];
           }
 
-	  // remove radial component of momentum change
+          // remove radial component of momentum change
           // (note we do this before scaling with invJW, ok since pointwise)
-	  const dfloat xij = x[i+j*p_Nq+e*p_Np];
-	  const dfloat yij = y[i+j*p_Nq+e*p_Np];
-	  const dfloat zij = z[i+j*p_Nq+e*p_Np];
-	  const dfloat xdotrhsq = (rhsq1*xij + rhsq2*yij + rhsq3*zij)*p_invRadiusSq;
-	  
-	  rhsq1 -= xij*xdotrhsq;
-	  rhsq2 -= yij*xdotrhsq;
-	  rhsq3 -= zij*xdotrhsq;
-	  
+          const dfloat xij = x[i+j*p_Nq+e*p_Np];
+          const dfloat yij = y[i+j*p_Nq+e*p_Np];
+          const dfloat zij = z[i+j*p_Nq+e*p_Np];
+          const dfloat xdotrhsq = (rhsq1*xij + rhsq2*yij + rhsq3*zij)*p_invRadiusSq;
+
+          rhsq1 -= xij*xdotrhsq;
+          rhsq2 -= yij*xdotrhsq;
+          rhsq3 -= zij*xdotrhsq;
+
           const dlong base = e*p_Np*p_Nfields + j*p_Nq + i;
-          
+
           // move to rhs
-          rhsq[base+0*p_Np] = -invJW*rhsq0; 
+          rhsq[base+0*p_Np] = -invJW*rhsq0;
           rhsq[base+1*p_Np] = -invJW*rhsq1;
           rhsq[base+2*p_Np] = -invJW*rhsq2;
-	  rhsq[base+3*p_Np] = -invJW*rhsq3;
+          rhsq[base+3*p_Np] = -invJW*rhsq3;
         }
       }
     }
@@ -351,40 +344,39 @@ SOFTWARE.
                                     const dfloat mu,
                                     @restrict const  dfloat *  q,
                                     @restrict dfloat *  viscousStresses){
-  
+
   for(dlong e=0;e<Nelements;++e;@outer(0)){
 
     @shared dfloat s_D[p_Nq][p_Nq];
     @shared dfloat s_u[p_Nq][p_Nq];
     @shared dfloat s_v[p_Nq][p_Nq];
     @shared dfloat s_w[p_Nq][p_Nq];
-    
+
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
-        
+
         s_D[j][i] = D[j*p_Nq+i];
 
         const dlong qbase = e*p_Nfields*p_Np + j*p_Nq + i;
         const dfloat r  = q[qbase + 0*p_Np];
         const dfloat ru = q[qbase + 1*p_Np];
         const dfloat rv = q[qbase + 2*p_Np];
-	const dfloat rw = q[qbase + 3*p_Np];
-        
+        const dfloat rw = q[qbase + 3*p_Np];
+
         s_u[j][i] = ru/r;
         s_v[j][i] = rv/r;
-	s_w[j][i] = rw/r;
-        
+        s_w[j][i] = rw/r;
+
       }
     }
-    
-    @barrier("local");
-    
+
+
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
-        
+
         dfloat dudr = 0, duds = 0;
-	dfloat dvdr = 0, dvds = 0;
-	dfloat dwdr = 0, dwds = 0;
+        dfloat dvdr = 0, dvds = 0;
+        dfloat dwdr = 0, dwds = 0;
 
         for(int n=0;n<p_Nq;++n){
           const dfloat Din = s_D[i][n];
@@ -396,7 +388,7 @@ SOFTWARE.
           dvdr += Din*s_v[j][n];
           dvds += Djn*s_v[n][i];
 
-	  dwdr += Din*s_w[j][n];
+          dwdr += Din*s_w[j][n];
           dwds += Djn*s_w[n][i];
         }
 
@@ -404,46 +396,46 @@ SOFTWARE.
 
         const dfloat rx = vgeo[gbase+p_Np*p_RXID];
         const dfloat ry = vgeo[gbase+p_Np*p_RYID];
-	const dfloat rz = vgeo[gbase+p_Np*p_RZID];
-	
+        const dfloat rz = vgeo[gbase+p_Np*p_RZID];
+
         const dfloat sx = vgeo[gbase+p_Np*p_SXID];
         const dfloat sy = vgeo[gbase+p_Np*p_SYID];
-	const dfloat sz = vgeo[gbase+p_Np*p_SZID];
+        const dfloat sz = vgeo[gbase+p_Np*p_SZID];
 
-	const dfloat tx = vgeo[gbase+p_Np*p_TXID];
+        const dfloat tx = vgeo[gbase+p_Np*p_TXID];
         const dfloat ty = vgeo[gbase+p_Np*p_TYID];
-	const dfloat tz = vgeo[gbase+p_Np*p_TZID];
+        const dfloat tz = vgeo[gbase+p_Np*p_TZID];
 
         const dfloat dudx = rx*dudr + sx*duds + tx*s_u[j][i];
         const dfloat dudy = ry*dudr + sy*duds + ty*s_u[j][i];
-	const dfloat dudz = rz*dudr + sz*duds + tz*s_u[j][i];
-	
+        const dfloat dudz = rz*dudr + sz*duds + tz*s_u[j][i];
+
         const dfloat dvdx = rx*dvdr + sx*dvds + tx*s_v[j][i];
         const dfloat dvdy = ry*dvdr + sy*dvds + ty*s_v[j][i];
-	const dfloat dvdz = rz*dvdr + sz*dvds + tz*s_v[j][i];
+        const dfloat dvdz = rz*dvdr + sz*dvds + tz*s_v[j][i];
 
-	const dfloat dwdx = rx*dwdr + sx*dwds + tx*s_w[j][i];
+        const dfloat dwdx = rx*dwdr + sx*dwds + tx*s_w[j][i];
         const dfloat dwdy = ry*dwdr + sy*dwds + ty*s_w[j][i];
-	const dfloat dwdz = rz*dwdr + sz*dwds + tz*s_w[j][i];
-        
+        const dfloat dwdz = rz*dwdr + sz*dwds + tz*s_w[j][i];
+
         const dlong sbase = e*p_Nstresses*p_Np + j*p_Nq + i;
 
-	const dfloat divV = dudx+dvdy+dwdz;
+        const dfloat divV = dudx+dvdy+dwdz;
 
         const dfloat S11 = p_half*(dudx+dudx) - p_third*divV;
-	const dfloat S22 = p_half*(dvdy+dvdy) - p_third*divV;
-	const dfloat S33 = p_half*(dwdz+dwdz) - p_third*divV;
-	
+        const dfloat S22 = p_half*(dvdy+dvdy) - p_third*divV;
+        const dfloat S33 = p_half*(dwdz+dwdz) - p_third*divV;
+
         const dfloat S12 = p_half*(dudy+dvdx);
-	const dfloat S13 = p_half*(dudz+dwdx);
-	const dfloat S23 = p_half*(dvdz+dwdy);
-        
+        const dfloat S13 = p_half*(dudz+dwdx);
+        const dfloat S23 = p_half*(dvdz+dwdy);
+
         viscousStresses[sbase + 0*p_Np] = p_two*mu*S11;
         viscousStresses[sbase + 1*p_Np] = p_two*mu*S12;
-	viscousStresses[sbase + 2*p_Np] = p_two*mu*S13;
+        viscousStresses[sbase + 2*p_Np] = p_two*mu*S13;
         viscousStresses[sbase + 3*p_Np] = p_two*mu*S22;
-	viscousStresses[sbase + 4*p_Np] = p_two*mu*S23;
-	viscousStresses[sbase + 5*p_Np] = p_two*mu*S33;
+        viscousStresses[sbase + 4*p_Np] = p_two*mu*S23;
+        viscousStresses[sbase + 5*p_Np] = p_two*mu*S33;
       }
     }
   }
diff --git a/solvers/cns/okl/cnsCubatureVolumeTet3D.okl b/solvers/cns/okl/cnsCubatureVolumeTet3D.okl
index 5ef24efc7..dcfebcf77 100644
--- a/solvers/cns/okl/cnsCubatureVolumeTet3D.okl
+++ b/solvers/cns/okl/cnsCubatureVolumeTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -93,7 +93,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_cubNp;++n;@inner(0)){
       //interpolate to cubature
@@ -196,7 +195,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_cubNp;++n;@inner(0)){
       if (n<p_Np) {
diff --git a/solvers/cns/okl/cnsCubatureVolumeTri2D.okl b/solvers/cns/okl/cnsCubatureVolumeTri2D.okl
index 2fcfb13df..8ee905738 100644
--- a/solvers/cns/okl/cnsCubatureVolumeTri2D.okl
+++ b/solvers/cns/okl/cnsCubatureVolumeTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -84,7 +84,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_cubNp;++n;@inner(0)){
       //interpolate to cubature
@@ -129,7 +128,6 @@ SOFTWARE.
       s_G[3][n] = -v*(E+p) + mu*(u*T12 + v*T22);
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_cubNp;++n;@inner(0)){
       if (n<p_Np) {
diff --git a/solvers/cns/okl/cnsGradSurfaceHex3D.okl b/solvers/cns/okl/cnsGradSurfaceHex3D.okl
index 641731e9b..0662424d1 100644
--- a/solvers/cns/okl/cnsGradSurfaceHex3D.okl
+++ b/solvers/cns/okl/cnsGradSurfaceHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -146,7 +146,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("global");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(2)){
@@ -170,7 +169,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("global");
 
     // face 2 & 4
     for(int es=0;es<p_NblockS;++es;@inner(2)){
diff --git a/solvers/cns/okl/cnsGradSurfaceQuad2D.okl b/solvers/cns/okl/cnsGradSurfaceQuad2D.okl
index 9bc516ee2..8b8d7ad11 100644
--- a/solvers/cns/okl/cnsGradSurfaceQuad2D.okl
+++ b/solvers/cns/okl/cnsGradSurfaceQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -131,7 +131,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -156,7 +155,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -179,7 +177,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/cns/okl/cnsGradSurfaceTet3D.okl b/solvers/cns/okl/cnsGradSurfaceTet3D.okl
index 7a6446c32..a565bef75 100644
--- a/solvers/cns/okl/cnsGradSurfaceTet3D.okl
+++ b/solvers/cns/okl/cnsGradSurfaceTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -125,7 +125,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/cns/okl/cnsGradSurfaceTri2D.okl b/solvers/cns/okl/cnsGradSurfaceTri2D.okl
index 11617f5b1..8fdf7e077 100644
--- a/solvers/cns/okl/cnsGradSurfaceTri2D.okl
+++ b/solvers/cns/okl/cnsGradSurfaceTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -112,7 +112,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/cns/okl/cnsGradVolumeHex3D.okl b/solvers/cns/okl/cnsGradVolumeHex3D.okl
index 9d91882b0..c63a4baf7 100644
--- a/solvers/cns/okl/cnsGradVolumeHex3D.okl
+++ b/solvers/cns/okl/cnsGradVolumeHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -57,7 +57,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
diff --git a/solvers/cns/okl/cnsGradVolumeQuad2D.okl b/solvers/cns/okl/cnsGradVolumeQuad2D.okl
index 939aa2eb0..6937c1215 100644
--- a/solvers/cns/okl/cnsGradVolumeQuad2D.okl
+++ b/solvers/cns/okl/cnsGradVolumeQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -51,7 +51,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/cns/okl/cnsGradVolumeTet3D.okl b/solvers/cns/okl/cnsGradVolumeTet3D.okl
index cae2562fc..fa3568696 100644
--- a/solvers/cns/okl/cnsGradVolumeTet3D.okl
+++ b/solvers/cns/okl/cnsGradVolumeTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -48,7 +48,6 @@ SOFTWARE.
       s_w[n] = rw/r;
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){
       // prefetch geometric factors (constant on tetrahedra)
diff --git a/solvers/cns/okl/cnsGradVolumeTri2D.okl b/solvers/cns/okl/cnsGradVolumeTri2D.okl
index 30a295266..2866f5c17 100644
--- a/solvers/cns/okl/cnsGradVolumeTri2D.okl
+++ b/solvers/cns/okl/cnsGradVolumeTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -45,7 +45,6 @@ SOFTWARE.
       s_v[n] = rv/r;
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){
       // prefetch geometric factors (constant on triangle)
diff --git a/solvers/cns/okl/cnsInitialCondition2D.okl b/solvers/cns/okl/cnsInitialCondition2D.okl
index ff1961e81..b7d93205b 100644
--- a/solvers/cns/okl/cnsInitialCondition2D.okl
+++ b/solvers/cns/okl/cnsInitialCondition2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/cns/okl/cnsInitialCondition3D.okl b/solvers/cns/okl/cnsInitialCondition3D.okl
index f9beac438..dd30dee42 100644
--- a/solvers/cns/okl/cnsInitialCondition3D.okl
+++ b/solvers/cns/okl/cnsInitialCondition3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/cns/okl/cnsIsothermalCubatureSurfaceHex3D.okl b/solvers/cns/okl/cnsIsothermalCubatureSurfaceHex3D.okl
index 5f8528367..26dc3b7cf 100644
--- a/solvers/cns/okl/cnsIsothermalCubatureSurfaceHex3D.okl
+++ b/solvers/cns/okl/cnsIsothermalCubatureSurfaceHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -116,7 +116,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -145,7 +144,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -162,7 +160,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -188,7 +185,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -275,7 +271,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -293,7 +288,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -304,7 +298,6 @@ void upwindRoeAveraged(const dfloat nx,
         }                                                               \
       }                                                                 \
     }                                                                   \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -322,7 +315,6 @@ void upwindRoeAveraged(const dfloat nx,
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -395,11 +387,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(0) //face 0
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -412,11 +402,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(5) //face 5
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -429,11 +417,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(1) //face 1
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -449,11 +435,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(3) //face 3
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -469,11 +453,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(2) //face 2
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -489,11 +471,9 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(4) //face 4
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -509,7 +489,6 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
diff --git a/solvers/cns/okl/cnsIsothermalCubatureSurfaceQuad2D.okl b/solvers/cns/okl/cnsIsothermalCubatureSurfaceQuad2D.okl
index 36de4242a..0c45f4ee4 100644
--- a/solvers/cns/okl/cnsIsothermalCubatureSurfaceQuad2D.okl
+++ b/solvers/cns/okl/cnsIsothermalCubatureSurfaceQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -208,7 +208,6 @@ void surfaceTerms(const int face,
       }
     }
 
-    @barrier("local");
 
 
     //interpolate traces, store flux in register
@@ -247,7 +246,6 @@ void surfaceTerms(const int face,
         }
     }
 
-    @barrier("local");
 
     //write fluxes to @shared
     for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -312,7 +310,6 @@ void surfaceTerms(const int face,
         }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -329,7 +326,6 @@ void surfaceTerms(const int face,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_cubNq;++j;@inner(0)){
@@ -344,7 +340,6 @@ void surfaceTerms(const int face,
       }
     }
 
-    @barrier("local");
 
     for(int i=0;i<p_cubNq;++i;@inner(0)){
       if(i<p_Nq) {
diff --git a/solvers/cns/okl/cnsIsothermalCubatureSurfaceTet3D.okl b/solvers/cns/okl/cnsIsothermalCubatureSurfaceTet3D.okl
index 6a206729d..136242b30 100644
--- a/solvers/cns/okl/cnsIsothermalCubatureSurfaceTet3D.okl
+++ b/solvers/cns/okl/cnsIsothermalCubatureSurfaceTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -183,7 +183,6 @@ void upwindRoeAveraged(const dfloat nx,
           }
         }
 
-        @barrier("local");
 
         // interpolate to surface integration nodes
         for(int n=0;n<p_cubMaxNodes1;++n;@inner(0)){ // maxNodes = max(Nfp*Nfaces,Np)
@@ -300,7 +299,6 @@ void upwindRoeAveraged(const dfloat nx,
         }
 
         // wait for all @shared memory writes of the previous inner loop to complete
-        @barrier("local");
 
         // for each node in the element
         for(int n=0;n<p_cubMaxNodes1;++n;@inner(0)){
@@ -317,10 +315,8 @@ void upwindRoeAveraged(const dfloat nx,
           }
         }
 
-        @barrier("local");
       }
 
-    @barrier("local");
 
     // for each node in the element
     for(int n=0;n<p_cubMaxNodes1;++n;@inner(0)){
@@ -420,7 +416,6 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int n=0;n<p_cubMaxNodes;++n;@inner(0)){ // maxNodes = max(Nfp*Nfaces,Np)
@@ -539,7 +534,6 @@ void upwindRoeAveraged(const dfloat nx,
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int n=0;n<p_cubMaxNodes;++n;@inner(0)){
diff --git a/solvers/cns/okl/cnsIsothermalCubatureSurfaceTri2D.okl b/solvers/cns/okl/cnsIsothermalCubatureSurfaceTri2D.okl
index a84ad05a0..ed280f19a 100644
--- a/solvers/cns/okl/cnsIsothermalCubatureSurfaceTri2D.okl
+++ b/solvers/cns/okl/cnsIsothermalCubatureSurfaceTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -151,7 +151,6 @@ void upwindRoeAveraged(const dfloat nx,
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int n=0;n<p_cubMaxNodes;++n;@inner(0)){ // maxNodes = max(Nfp*Nfaces,Np)
@@ -235,7 +234,6 @@ void upwindRoeAveraged(const dfloat nx,
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int n=0;n<p_cubMaxNodes;++n;@inner(0)){
diff --git a/solvers/cns/okl/cnsIsothermalCubatureVolumeHex3D.okl b/solvers/cns/okl/cnsIsothermalCubatureVolumeHex3D.okl
index 53898f38a..cb8ba7f76 100644
--- a/solvers/cns/okl/cnsIsothermalCubatureVolumeHex3D.okl
+++ b/solvers/cns/okl/cnsIsothermalCubatureVolumeHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -90,7 +90,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //read in and interpolate in k
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -135,7 +134,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_cubNq
     for(int k=0;k<p_cubNq;++k){
@@ -153,7 +151,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -179,7 +176,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -194,7 +190,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -218,7 +213,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     //#pragma unroll p_cubNq
@@ -309,7 +303,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -328,7 +321,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     } //k loop
 
     #pragma unroll p_cubNq
@@ -342,7 +334,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -362,7 +353,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -374,7 +364,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -394,7 +383,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     //project in k and write out
diff --git a/solvers/cns/okl/cnsIsothermalCubatureVolumeQuad2D.okl b/solvers/cns/okl/cnsIsothermalCubatureVolumeQuad2D.okl
index db5ba19ae..7974ca8f1 100644
--- a/solvers/cns/okl/cnsIsothermalCubatureVolumeQuad2D.okl
+++ b/solvers/cns/okl/cnsIsothermalCubatureVolumeQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -99,7 +99,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -124,7 +123,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -141,7 +139,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in j and store flux in register
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -164,7 +161,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -220,7 +216,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in j
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -244,7 +239,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -260,7 +254,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in i and write back
     for(int j=0;j<p_cubNq;++j;@inner(1)){
diff --git a/solvers/cns/okl/cnsIsothermalCubatureVolumeTet3D.okl b/solvers/cns/okl/cnsIsothermalCubatureVolumeTet3D.okl
index d07e3d64f..55760f161 100644
--- a/solvers/cns/okl/cnsIsothermalCubatureVolumeTet3D.okl
+++ b/solvers/cns/okl/cnsIsothermalCubatureVolumeTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -91,7 +91,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_cubNp;++n;@inner(0)){
       //interpolate to cubature
@@ -181,7 +180,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_cubNp;++n;@inner(0)){
       if (n<p_Np) {
diff --git a/solvers/cns/okl/cnsIsothermalCubatureVolumeTri2D.okl b/solvers/cns/okl/cnsIsothermalCubatureVolumeTri2D.okl
index ad35ac22a..969dad82e 100644
--- a/solvers/cns/okl/cnsIsothermalCubatureVolumeTri2D.okl
+++ b/solvers/cns/okl/cnsIsothermalCubatureVolumeTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -82,7 +82,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_cubNp;++n;@inner(0)){
       //interpolate to cubature
@@ -122,7 +121,6 @@ SOFTWARE.
       s_G[2][n] = -(rv*v+p) + mu*T22;
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_cubNp;++n;@inner(0)){
       if (n<p_Np) {
diff --git a/solvers/cns/okl/cnsIsothermalSurfaceHex3D.okl b/solvers/cns/okl/cnsIsothermalSurfaceHex3D.okl
index 718e7cddb..5fe96ca16 100644
--- a/solvers/cns/okl/cnsIsothermalSurfaceHex3D.okl
+++ b/solvers/cns/okl/cnsIsothermalSurfaceHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -254,7 +254,8 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("global");
+    /*Need barriers because surfaceTerms writes to global*/
+    @barrier();
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(2)){
@@ -277,7 +278,7 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("global");
+    @barrier();
 
     // face 2 & 4
     for(int es=0;es<p_NblockS;++es;@inner(2)){
diff --git a/solvers/cns/okl/cnsIsothermalSurfaceQuad2D.okl b/solvers/cns/okl/cnsIsothermalSurfaceQuad2D.okl
index 5b9e08983..4d1b7126d 100644
--- a/solvers/cns/okl/cnsIsothermalSurfaceQuad2D.okl
+++ b/solvers/cns/okl/cnsIsothermalSurfaceQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -209,7 +209,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -233,7 +232,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -256,7 +254,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/cns/okl/cnsIsothermalSurfaceTet3D.okl b/solvers/cns/okl/cnsIsothermalSurfaceTet3D.okl
index 103b38ae6..03618bee1 100644
--- a/solvers/cns/okl/cnsIsothermalSurfaceTet3D.okl
+++ b/solvers/cns/okl/cnsIsothermalSurfaceTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -239,7 +239,6 @@ void upwindRoeAveraged(const dfloat nx,
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/cns/okl/cnsIsothermalSurfaceTri2D.okl b/solvers/cns/okl/cnsIsothermalSurfaceTri2D.okl
index b0546bcea..f6f8cc569 100644
--- a/solvers/cns/okl/cnsIsothermalSurfaceTri2D.okl
+++ b/solvers/cns/okl/cnsIsothermalSurfaceTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -198,7 +198,6 @@ void upwindRoeAveraged(const dfloat nx,
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/cns/okl/cnsIsothermalVolumeHex3D.okl b/solvers/cns/okl/cnsIsothermalVolumeHex3D.okl
index a1e5b3c76..acf99787e 100644
--- a/solvers/cns/okl/cnsIsothermalVolumeHex3D.okl
+++ b/solvers/cns/okl/cnsIsothermalVolumeHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -147,7 +147,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
diff --git a/solvers/cns/okl/cnsIsothermalVolumeQuad2D.okl b/solvers/cns/okl/cnsIsothermalVolumeQuad2D.okl
index a06128827..0f75f18cd 100644
--- a/solvers/cns/okl/cnsIsothermalVolumeQuad2D.okl
+++ b/solvers/cns/okl/cnsIsothermalVolumeQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -112,7 +112,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/cns/okl/cnsIsothermalVolumeTet3D.okl b/solvers/cns/okl/cnsIsothermalVolumeTet3D.okl
index 3e5ee517a..e2a3ff1f4 100644
--- a/solvers/cns/okl/cnsIsothermalVolumeTet3D.okl
+++ b/solvers/cns/okl/cnsIsothermalVolumeTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -139,7 +139,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){
 
diff --git a/solvers/cns/okl/cnsIsothermalVolumeTri2D.okl b/solvers/cns/okl/cnsIsothermalVolumeTri2D.okl
index 96cf1e1c7..28fc792be 100644
--- a/solvers/cns/okl/cnsIsothermalVolumeTri2D.okl
+++ b/solvers/cns/okl/cnsIsothermalVolumeTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -106,7 +106,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){
 
diff --git a/solvers/cns/okl/cnsMaxWaveSpeedHex3D.okl b/solvers/cns/okl/cnsMaxWaveSpeedHex3D.okl
index 569ba2733..b69bcd643 100644
--- a/solvers/cns/okl/cnsMaxWaveSpeedHex3D.okl
+++ b/solvers/cns/okl/cnsMaxWaveSpeedHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/cns/okl/cnsMaxWaveSpeedQuad2D.okl b/solvers/cns/okl/cnsMaxWaveSpeedQuad2D.okl
index 7a5b1f606..7165da669 100644
--- a/solvers/cns/okl/cnsMaxWaveSpeedQuad2D.okl
+++ b/solvers/cns/okl/cnsMaxWaveSpeedQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -369,4 +369,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/cns/okl/cnsMaxWaveSpeedTet3D.okl b/solvers/cns/okl/cnsMaxWaveSpeedTet3D.okl
index 4aee881f5..004c2e4bf 100644
--- a/solvers/cns/okl/cnsMaxWaveSpeedTet3D.okl
+++ b/solvers/cns/okl/cnsMaxWaveSpeedTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -409,4 +409,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/cns/okl/cnsMaxWaveSpeedTri2D.okl b/solvers/cns/okl/cnsMaxWaveSpeedTri2D.okl
index 57b6ad01e..c56d100e0 100644
--- a/solvers/cns/okl/cnsMaxWaveSpeedTri2D.okl
+++ b/solvers/cns/okl/cnsMaxWaveSpeedTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -381,4 +381,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/cns/okl/cnsSurfaceHex3D.okl b/solvers/cns/okl/cnsSurfaceHex3D.okl
index 266932cf6..7e85f7319 100644
--- a/solvers/cns/okl/cnsSurfaceHex3D.okl
+++ b/solvers/cns/okl/cnsSurfaceHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -274,7 +274,8 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("global");
+    /*Need barriers because surfaceTerms writes to global*/
+    @barrier();
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(2)){
@@ -297,7 +298,7 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("global");
+    @barrier();
 
     // face 2 & 4
     for(int es=0;es<p_NblockS;++es;@inner(2)){
diff --git a/solvers/cns/okl/cnsSurfaceQuad2D.okl b/solvers/cns/okl/cnsSurfaceQuad2D.okl
index 2cfb8a209..cf81f3e81 100644
--- a/solvers/cns/okl/cnsSurfaceQuad2D.okl
+++ b/solvers/cns/okl/cnsSurfaceQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -232,7 +232,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -256,7 +255,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -279,7 +277,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/cns/okl/cnsSurfaceQuad3D.okl b/solvers/cns/okl/cnsSurfaceQuad3D.okl
index 86b9133c2..403456448 100644
--- a/solvers/cns/okl/cnsSurfaceQuad3D.okl
+++ b/solvers/cns/okl/cnsSurfaceQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,21 +27,21 @@ SOFTWARE.
 
 
 void upwindRoeAveraged(const dfloat nx,
-		       const dfloat ny,
-		       const dfloat nz,
-		       const dfloat rM,
-		       const dfloat ruM,
-		       const dfloat rvM,
-		       const dfloat rwM,       
-		       const dfloat rP,
-		       const dfloat ruP,
-		       const dfloat rvP,
-		       const dfloat rwP,
-		       dfloat *rflux,
-		       dfloat *ruflux,
-		       dfloat *rvflux,
-		       dfloat *rwflux){
-  
+                       const dfloat ny,
+                       const dfloat nz,
+                       const dfloat rM,
+                       const dfloat ruM,
+                       const dfloat rvM,
+                       const dfloat rwM,
+                       const dfloat rP,
+                       const dfloat ruP,
+                       const dfloat rvP,
+                       const dfloat rwP,
+                       dfloat *rflux,
+                       dfloat *ruflux,
+                       dfloat *rvflux,
+                       dfloat *rwflux){
+
   dfloat sqrtrM = sqrt(rM);
   dfloat sqrtrP = sqrt(rP);
 
@@ -58,7 +58,7 @@ void upwindRoeAveraged(const dfloat nx,
   dfloat wP = rwP/rP;
   dfloat pP = rP*p_RT;
   dfloat qnP = nx*uP+ny*vP+nz*wP;
-  
+
   // Roe averaged variables
   dfloat r  = sqrtrM*sqrtrP;
   dfloat u  = (uM*sqrtrM + uP*sqrtrP)/(sqrtrM + sqrtrP);
@@ -122,110 +122,110 @@ void upwindRoeAveraged(const dfloat nx,
 }
 
 
-void surfaceTerms(const int e, 
-                  const int es, 
-                  const int sk, 
-                  const int face, 
-                  const int i, 
+void surfaceTerms(const int e,
+                  const int es,
+                  const int sk,
+                  const int face,
+                  const int i,
                   const int j,
                   const dfloat time,
-		  const dfloat mu, 
-                  const dfloat intfx, 
+                  const dfloat mu,
+                  const dfloat intfx,
                   const dfloat intfy,
-		  const dfloat intfz, 
+                  const dfloat intfz,
                   const int advSwitch,
-                  @global const dfloat *x, 
+                  @global const dfloat *x,
                   @global const dfloat *y,
-		  @global const dfloat *z, 
-                  @global const dfloat *sgeo, 
-                  @global const int *vmapM, 
-                  @global const int *vmapP, 
-		  @global const int *EToB,
-                  @global const dfloat *q, 
+                  @global const dfloat *z,
+                  @global const dfloat *sgeo,
+                  @global const int *vmapM,
+                  @global const int *vmapP,
+                  @global const int *EToB,
+                  @global const dfloat *q,
                   @global const dfloat *viscousStresses,
                   dfloat s_rflux [p_NblockS][p_Nq][p_Nq],
                   dfloat s_ruflux [p_NblockS][p_Nq][p_Nq],
                   dfloat s_rvflux [p_NblockS][p_Nq][p_Nq],
-		  dfloat s_rwflux [p_NblockS][p_Nq][p_Nq]){
-  
-  const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];                            
+                  dfloat s_rwflux [p_NblockS][p_Nq][p_Nq]){
+
+  const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];
   const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];
-  const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];                            
-  const dfloat sJ = sgeo[sk*p_Nsgeo+p_SJID];                            
-  const dfloat invWJ = sgeo[sk*p_Nsgeo+p_WIJID];                        
-                                                                        
-  const dlong idM = vmapM[sk];                                  
-  const dlong idP = vmapP[sk];                                  
-  
-  const dlong eM = e;                                                   
-  const dlong eP = idP/p_Np;                                            
-  const int vidM = idM%p_Np;                                            
-  const int vidP = idP%p_Np;                                            
-  
-  const dlong qbaseM = eM*p_Np*p_Nfields + vidM;                        
-  const dlong qbaseP = eP*p_Np*p_Nfields + vidP;                        
-  
-  const dlong sbaseM = eM*p_Np*p_Nstresses + vidM;                      
-  const dlong sbaseP = eP*p_Np*p_Nstresses + vidP;                      
-  
-  const dfloat rM  = q[qbaseM + 0*p_Np];                                
-  const dfloat ruM = q[qbaseM + 1*p_Np];                                
+  const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];
+  const dfloat sJ = sgeo[sk*p_Nsgeo+p_SJID];
+  const dfloat invWJ = sgeo[sk*p_Nsgeo+p_WIJID];
+
+  const dlong idM = vmapM[sk];
+  const dlong idP = vmapP[sk];
+
+  const dlong eM = e;
+  const dlong eP = idP/p_Np;
+  const int vidM = idM%p_Np;
+  const int vidP = idP%p_Np;
+
+  const dlong qbaseM = eM*p_Np*p_Nfields + vidM;
+  const dlong qbaseP = eP*p_Np*p_Nfields + vidP;
+
+  const dlong sbaseM = eM*p_Np*p_Nstresses + vidM;
+  const dlong sbaseP = eP*p_Np*p_Nstresses + vidP;
+
+  const dfloat rM  = q[qbaseM + 0*p_Np];
+  const dfloat ruM = q[qbaseM + 1*p_Np];
   const dfloat rvM = q[qbaseM + 2*p_Np];
-  const dfloat rwM = q[qbaseM + 3*p_Np];                                
-  
-  const dfloat T11M = viscousStresses[sbaseM+0*p_Np];                   
+  const dfloat rwM = q[qbaseM + 3*p_Np];
+
+  const dfloat T11M = viscousStresses[sbaseM+0*p_Np];
   const dfloat T12M = viscousStresses[sbaseM+1*p_Np];
-  const dfloat T13M = viscousStresses[sbaseM+2*p_Np];                   
+  const dfloat T13M = viscousStresses[sbaseM+2*p_Np];
   const dfloat T22M = viscousStresses[sbaseM+3*p_Np];
   const dfloat T23M = viscousStresses[sbaseM+4*p_Np];
-  const dfloat T33M = viscousStresses[sbaseM+5*p_Np];                   
-  
-  dfloat rP  = q[qbaseP + 0*p_Np];                                      
-  dfloat ruP = q[qbaseP + 1*p_Np];                                      
+  const dfloat T33M = viscousStresses[sbaseM+5*p_Np];
+
+  dfloat rP  = q[qbaseP + 0*p_Np];
+  dfloat ruP = q[qbaseP + 1*p_Np];
   dfloat rvP = q[qbaseP + 2*p_Np];
-  dfloat rwP = q[qbaseP + 3*p_Np];                                      
-  
-  const dfloat T11P = viscousStresses[sbaseP+0*p_Np];                   
+  dfloat rwP = q[qbaseP + 3*p_Np];
+
+  const dfloat T11P = viscousStresses[sbaseP+0*p_Np];
   const dfloat T12P = viscousStresses[sbaseP+1*p_Np];
-  const dfloat T13P = viscousStresses[sbaseP+2*p_Np];                   
+  const dfloat T13P = viscousStresses[sbaseP+2*p_Np];
   const dfloat T22P = viscousStresses[sbaseP+3*p_Np];
   const dfloat T23P = viscousStresses[sbaseP+4*p_Np];
-  const dfloat T33P = viscousStresses[sbaseP+5*p_Np];                   
-  
-  const dfloat uM = ruM/rM;                                             
+  const dfloat T33P = viscousStresses[sbaseP+5*p_Np];
+
+  const dfloat uM = ruM/rM;
   const dfloat vM = rvM/rM;
-  const dfloat wM = rwM/rM;                                             
-  const dfloat pM = p_RT*rM;                                            
-  
-  dfloat uP = ruP/rP;                                                   
+  const dfloat wM = rwM/rM;
+  const dfloat pM = p_RT*rM;
+
+  dfloat uP = ruP/rP;
   dfloat vP = rvP/rP;
-  dfloat wP = rwP/rP;                                                   
-  dfloat pP = p_RT*rP;                                          
-  
-  const dfloat sc = invWJ*sJ;                                           
-  
-  dfloat rflux, ruflux, rvflux, rwflux;                                 
-  upwindRoeAveraged (nx, ny, nz, rM, ruM, rvM, rwM, rP, ruP, rvP, rwP, &rflux, &ruflux, &rvflux, &rwflux); 
-
-  rflux  *= advSwitch;                                                   
-  ruflux *= advSwitch;                                          
+  dfloat wP = rwP/rP;
+  dfloat pP = p_RT*rP;
+
+  const dfloat sc = invWJ*sJ;
+
+  dfloat rflux, ruflux, rvflux, rwflux;
+  upwindRoeAveraged (nx, ny, nz, rM, ruM, rvM, rwM, rP, ruP, rvP, rwP, &rflux, &ruflux, &rvflux, &rwflux);
+
+  rflux  *= advSwitch;
+  ruflux *= advSwitch;
   rvflux *= advSwitch;
-  rwflux *= advSwitch;                                          
-  
-  ruflux -= p_half*(nx*(T11P+T11M) + ny*(T12P+T12M) + nz*(T13P+T13M));                   
+  rwflux *= advSwitch;
+
+  ruflux -= p_half*(nx*(T11P+T11M) + ny*(T12P+T12M) + nz*(T13P+T13M));
   rvflux -= p_half*(nx*(T12P+T12M) + ny*(T22P+T22M) + nz*(T23P+T23M));
   rwflux -= p_half*(nx*(T13P+T13M) + ny*(T23P+T23M) + nz*(T33P+T33M));
 
-  const dfloat penalty = mu*sgeo[sk*p_Nsgeo+p_IHID]*(p_Nq)*(p_Nq-1)*p_half;                        
+  const dfloat penalty = mu*sgeo[sk*p_Nsgeo+p_IHID]*(p_Nq)*(p_Nq-1)*p_half;
 
   ruflux -= penalty*(ruP-ruM);
   rvflux -= penalty*(rvP-rvM);
   rwflux -= penalty*(rwP-rwM);
-  
-  s_rflux [es][j][i] += sc*(-rflux);                                    
-  s_ruflux[es][j][i] += sc*(-ruflux);                                  
+
+  s_rflux [es][j][i] += sc*(-rflux);
+  s_ruflux[es][j][i] += sc*(-ruflux);
   s_rvflux[es][j][i] += sc*(-rvflux);
-  s_rwflux[es][j][i] += sc*(-rwflux);                                   
+  s_rwflux[es][j][i] += sc*(-rwflux);
 }
 
 // batch process elements
@@ -239,18 +239,18 @@ void surfaceTerms(const int e,
                              const dfloat time,
                              @restrict const  dfloat *  x,
                              @restrict const  dfloat *  y,
-                             @restrict const  dfloat *  z, 
+                             @restrict const  dfloat *  z,
                              const dfloat mu,
                              const dfloat intfx,
                              const dfloat intfy,
-                             const dfloat intfz, 
+                             const dfloat intfz,
                              @restrict const  dfloat *  q,
                              @restrict const  dfloat *  viscousStresses,
                              @restrict dfloat *  rhsq){
-  
+
   // for all elements
   for(dlong eo=0;eo<Nelements;eo+=p_NblockS;@outer(0)){
-    
+
     // @shared storage for flux terms
     @shared dfloat s_rflux [p_NblockS][p_Nq][p_Nq];
     @shared dfloat s_ruflux[p_NblockS][p_Nq][p_Nq];
@@ -264,12 +264,11 @@ void surfaceTerms(const int e,
             s_rflux [es][j][i] = 0.;
             s_ruflux[es][j][i] = 0.;
             s_rvflux[es][j][i] = 0.;
-	    s_rwflux[es][j][i] = 0.;
+            s_rwflux[es][j][i] = 0.;
           }
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -283,7 +282,7 @@ void surfaceTerms(const int e,
           // surfaceTerms(sk0,0,i,0     );
           surfaceTerms(e, es, sk0, 0, i, 0,
                        time, mu, intfx, intfy, intfz, advSwitch, x, y, z, sgeo, vmapM, vmapP, EToB, q, viscousStresses, s_rflux, s_ruflux, s_rvflux, s_rwflux);
-          
+
           //          surfaceTerms(sk2,2,i,p_Nq-1);
           surfaceTerms(e, es, sk2, 2, i, p_Nq-1,
                        time, mu, intfx, intfy, intfz, advSwitch, x, y, z, sgeo, vmapM, vmapP, EToB, q, viscousStresses, s_rflux, s_ruflux, s_rvflux, s_rwflux);
@@ -291,7 +290,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -312,7 +310,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -325,7 +322,7 @@ void surfaceTerms(const int e,
               rhsq[base+0*p_Np] += s_rflux [es][j][i];
               rhsq[base+1*p_Np] += s_ruflux[es][j][i];
               rhsq[base+2*p_Np] += s_rvflux[es][j][i];
-	      rhsq[base+3*p_Np] += s_rwflux[es][j][i];
+              rhsq[base+3*p_Np] += s_rwflux[es][j][i];
             }
         }
       }
@@ -333,68 +330,68 @@ void surfaceTerms(const int e,
   }
 }
 
-void stressSurfaceTerms(const int e, 
-                        const int es, 
-                        const int sk, 
-                        const int face, 
-                        const int i, 
+void stressSurfaceTerms(const int e,
+                        const int es,
+                        const int sk,
+                        const int face,
+                        const int i,
                         const int j,
-                        const dfloat time, 
-                        const dfloat mu, 
-                        const dfloat intfx, 
+                        const dfloat time,
+                        const dfloat mu,
+                        const dfloat intfx,
                         const dfloat intfy,
-			const dfloat intfz, 
-                        @global const dfloat *x, 
+                        const dfloat intfz,
+                        @global const dfloat *x,
                         @global const dfloat *y,
-			@global const dfloat *z, 
-                        @global const dfloat *sgeo, 
-                        @global const int *vmapM, 
-                        @global const int *vmapP, 
+                        @global const dfloat *z,
+                        @global const dfloat *sgeo,
+                        @global const int *vmapM,
+                        @global const int *vmapP,
                         @global const int *EToB,
-                        @global const dfloat *q, 
+                        @global const dfloat *q,
                         @global const dfloat *viscousStresses,
                         dfloat s_T11flux [p_NblockS][p_Nq][p_Nq],
                         dfloat s_T12flux [p_NblockS][p_Nq][p_Nq],
-			dfloat s_T13flux [p_NblockS][p_Nq][p_Nq],
+                        dfloat s_T13flux [p_NblockS][p_Nq][p_Nq],
                         dfloat s_T22flux [p_NblockS][p_Nq][p_Nq],
-			dfloat s_T23flux [p_NblockS][p_Nq][p_Nq],
-			dfloat s_T33flux [p_NblockS][p_Nq][p_Nq]
-			){
-  
-    const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];                          
+                        dfloat s_T23flux [p_NblockS][p_Nq][p_Nq],
+                        dfloat s_T33flux [p_NblockS][p_Nq][p_Nq]
+                        ){
+
+    const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];
     const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];
-    const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];                          
-    const dfloat sJ = sgeo[sk*p_Nsgeo+p_SJID];                          
-    const dfloat invWJ = sgeo[sk*p_Nsgeo+p_WIJID];                      
-                                                                        
-    const dlong idM = vmapM[sk];                                        
-    const dlong idP = vmapP[sk];                                        
-                                                                        
-    const dlong eM = e;                                                 
-    const dlong eP = idP/p_Np;                                          
-    const int vidM = idM%p_Np;                                          
-    const int vidP = idP%p_Np;                                          
-                                                                        
-    const dlong baseM = eM*p_Np*p_Nfields + vidM;                       
-    const dlong baseP = eP*p_Np*p_Nfields + vidP;                       
-                                                                        
-    const dfloat rM  = q[baseM + 0*p_Np];                               
-    const dfloat ruM = q[baseM + 1*p_Np];                               
+    const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];
+    const dfloat sJ = sgeo[sk*p_Nsgeo+p_SJID];
+    const dfloat invWJ = sgeo[sk*p_Nsgeo+p_WIJID];
+
+    const dlong idM = vmapM[sk];
+    const dlong idP = vmapP[sk];
+
+    const dlong eM = e;
+    const dlong eP = idP/p_Np;
+    const int vidM = idM%p_Np;
+    const int vidP = idP%p_Np;
+
+    const dlong baseM = eM*p_Np*p_Nfields + vidM;
+    const dlong baseP = eP*p_Np*p_Nfields + vidP;
+
+    const dfloat rM  = q[baseM + 0*p_Np];
+    const dfloat ruM = q[baseM + 1*p_Np];
     const dfloat rvM = q[baseM + 2*p_Np];
-    const dfloat rwM = q[baseM + 3*p_Np];                               
-                                                                        
-    dfloat uM = ruM/rM;                                                 
+    const dfloat rwM = q[baseM + 3*p_Np];
+
+    dfloat uM = ruM/rM;
     dfloat vM = rvM/rM;
-    dfloat wM = rwM/rM;                                                 
-                                                                        
-    dfloat rP  = q[baseP + 0*p_Np];                                     
-    dfloat ruP = q[baseP + 1*p_Np];                                     
+    dfloat wM = rwM/rM;
+
+    dfloat rP  = q[baseP + 0*p_Np];
+    dfloat ruP = q[baseP + 1*p_Np];
     dfloat rvP = q[baseP + 2*p_Np];
-    dfloat rwP = q[baseP + 3*p_Np];                                     
-                                                                        
-    dfloat uP = ruP/rP;                                                 
+    dfloat rwP = q[baseP + 3*p_Np];
+
+    dfloat uP = ruP/rP;
     dfloat vP = rvP/rP;
-    dfloat wP = rwP/rP;                                                 
+    dfloat wP = rwP/rP;
 
     const dfloat ndotdV = nx*(uP-uM)+ny*(vP-vM)+nz*(wP-wM);
 
@@ -404,34 +401,34 @@ void stressSurfaceTerms(const int e,
 
     const dfloat dS12 = p_half*(ny*(uP-uM) + nx*(vP-vM));
     const dfloat dS13 = p_half*(nz*(uP-uM) + nx*(wP-wM));
-    const dfloat dS23 = p_half*(nz*(vP-vM) + ny*(wP-wM));      
-    
-    const dfloat sc = invWJ * sJ;                                       
-    s_T11flux[es][j][i] += sc*p_two*mu*dS11;                            
+    const dfloat dS23 = p_half*(nz*(vP-vM) + ny*(wP-wM));
+
+    const dfloat sc = invWJ * sJ;
+    s_T11flux[es][j][i] += sc*p_two*mu*dS11;
     s_T12flux[es][j][i] += sc*p_two*mu*dS12;
-    s_T13flux[es][j][i] += sc*p_two*mu*dS13;                            
+    s_T13flux[es][j][i] += sc*p_two*mu*dS13;
     s_T22flux[es][j][i] += sc*p_two*mu*dS22;
     s_T23flux[es][j][i] += sc*p_two*mu*dS23;
-    s_T33flux[es][j][i] += sc*p_two*mu*dS33;                            
+    s_T33flux[es][j][i] += sc*p_two*mu*dS33;
   }
 
 @kernel void cnsStressesSurfaceQuad3D(const int Nelements,
-				      @restrict const  dfloat *  sgeo,
-				      @restrict const  dfloat *  LIFTT,
-				      @restrict const  int   *  vmapM,
-				      @restrict const  int   *  vmapP,
-				      @restrict const  int   *  EToB,
-				      const dfloat time,
-				      @restrict const  dfloat *  x,
-				      @restrict const  dfloat *  y,
-				      @restrict const  dfloat *  z,
-				      const dfloat mu,
-				      const dfloat intfx,
-				      const dfloat intfy,
-				      const dfloat intfz, 
-				      @restrict const  dfloat *  q,
-				      @restrict dfloat *  viscousStresses){
-  
+                                      @restrict const  dfloat *  sgeo,
+                                      @restrict const  dfloat *  LIFTT,
+                                      @restrict const  int   *  vmapM,
+                                      @restrict const  int   *  vmapP,
+                                      @restrict const  int   *  EToB,
+                                      const dfloat time,
+                                      @restrict const  dfloat *  x,
+                                      @restrict const  dfloat *  y,
+                                      @restrict const  dfloat *  z,
+                                      const dfloat mu,
+                                      const dfloat intfx,
+                                      const dfloat intfy,
+                                      const dfloat intfz,
+                                      @restrict const  dfloat *  q,
+                                      @restrict dfloat *  viscousStresses){
+
   // for all elements
   for(dlong eo=0;eo<Nelements;eo+=p_NblockS;@outer(0)){
     // @shared storage for flux terms
@@ -448,15 +445,14 @@ void stressSurfaceTerms(const int e,
           for(int j=0;j<p_Nq;++j){
             s_T11flux[es][j][i] = 0.;
             s_T12flux[es][j][i] = 0.;
-	    s_T13flux[es][j][i] = 0.;
+            s_T13flux[es][j][i] = 0.;
             s_T22flux[es][j][i] = 0.;
-	    s_T23flux[es][j][i] = 0.;
-	    s_T33flux[es][j][i] = 0.;
+            s_T23flux[es][j][i] = 0.;
+            s_T33flux[es][j][i] = 0.;
           }
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -468,18 +464,17 @@ void stressSurfaceTerms(const int e,
           const dlong sk2 = e*p_Nfp*p_Nfaces + 2*p_Nfp + i;
 
           //          stressSurfaceTerms(sk0,0,i,0     );
-          stressSurfaceTerms(e, es, sk0, 0, i, 0, 
-                             time, mu, intfx, intfy, intfz, x, y, z, sgeo, vmapM, vmapP, EToB, q, viscousStresses, s_T11flux, s_T12flux, s_T13flux, s_T22flux, s_T23flux, s_T33flux); 
+          stressSurfaceTerms(e, es, sk0, 0, i, 0,
+                             time, mu, intfx, intfy, intfz, x, y, z, sgeo, vmapM, vmapP, EToB, q, viscousStresses, s_T11flux, s_T12flux, s_T13flux, s_T22flux, s_T23flux, s_T33flux);
 
           //          stressSurfaceTerms(sk2,2,i,p_Nq-1);
-          stressSurfaceTerms(e, es, sk2, 2, i, p_Nq-1, 
+          stressSurfaceTerms(e, es, sk2, 2, i, p_Nq-1,
                              time, mu, intfx, intfy, intfz, x, y, z, sgeo, vmapM, vmapP, EToB, q, viscousStresses, s_T11flux, s_T12flux, s_T13flux, s_T22flux, s_T23flux, s_T33flux);
-          
+
         }
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -490,17 +485,16 @@ void stressSurfaceTerms(const int e,
           const dlong sk3 = e*p_Nfp*p_Nfaces + 3*p_Nfp + j;
 
           //          stressSurfaceTerms(sk1,1,p_Nq-1,j);
-          stressSurfaceTerms(e, es, sk1, 1, p_Nq-1, j, 
+          stressSurfaceTerms(e, es, sk1, 1, p_Nq-1, j,
                              time, mu, intfx, intfy, intfz, x, y, z, sgeo, vmapM, vmapP, EToB, q, viscousStresses, s_T11flux, s_T12flux, s_T13flux, s_T22flux, s_T23flux, s_T33flux);
 
           //stressSurfaceTerms(sk3,3,0     ,j);
-          stressSurfaceTerms(e, es, sk3, 3, 0, j, 
+          stressSurfaceTerms(e, es, sk3, 3, 0, j,
                              time, mu, intfx, intfy, intfz, x, y, z, sgeo, vmapM, vmapP, EToB, q, viscousStresses, s_T11flux, s_T12flux, s_T13flux, s_T22flux, s_T23flux, s_T33flux);
         }
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -512,10 +506,10 @@ void stressSurfaceTerms(const int e,
               const dlong base = e*p_Np*p_Nstresses+j*p_Nq+i;
               viscousStresses[base+0*p_Np] += s_T11flux[es][j][i];
               viscousStresses[base+1*p_Np] += s_T12flux[es][j][i];
-	      viscousStresses[base+2*p_Np] += s_T13flux[es][j][i];
+              viscousStresses[base+2*p_Np] += s_T13flux[es][j][i];
               viscousStresses[base+3*p_Np] += s_T22flux[es][j][i];
-	      viscousStresses[base+4*p_Np] += s_T23flux[es][j][i];
-	      viscousStresses[base+5*p_Np] += s_T33flux[es][j][i];
+              viscousStresses[base+4*p_Np] += s_T23flux[es][j][i];
+              viscousStresses[base+5*p_Np] += s_T33flux[es][j][i];
             }
         }
       }
diff --git a/solvers/cns/okl/cnsSurfaceTet3D.okl b/solvers/cns/okl/cnsSurfaceTet3D.okl
index 75134c840..6a979b474 100644
--- a/solvers/cns/okl/cnsSurfaceTet3D.okl
+++ b/solvers/cns/okl/cnsSurfaceTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -254,7 +254,6 @@ void upwindRoeAveraged(const dfloat nx,
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
diff --git a/solvers/cns/okl/cnsSurfaceTri2D.okl b/solvers/cns/okl/cnsSurfaceTri2D.okl
index b2ec528e3..1bee51616 100644
--- a/solvers/cns/okl/cnsSurfaceTri2D.okl
+++ b/solvers/cns/okl/cnsSurfaceTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -219,7 +219,6 @@ void upwindRoeAveraged(const dfloat nx,
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/cns/okl/cnsVolumeHex3D.okl b/solvers/cns/okl/cnsVolumeHex3D.okl
index 71e795a38..53443a76f 100644
--- a/solvers/cns/okl/cnsVolumeHex3D.okl
+++ b/solvers/cns/okl/cnsVolumeHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -160,7 +160,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
diff --git a/solvers/cns/okl/cnsVolumeQuad2D.okl b/solvers/cns/okl/cnsVolumeQuad2D.okl
index eb579f021..0d22c62be 100644
--- a/solvers/cns/okl/cnsVolumeQuad2D.okl
+++ b/solvers/cns/okl/cnsVolumeQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -120,7 +120,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -152,4 +151,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/cns/okl/cnsVolumeQuad3D.okl b/solvers/cns/okl/cnsVolumeQuad3D.okl
index 77227d43e..c84a184af 100644
--- a/solvers/cns/okl/cnsVolumeQuad3D.okl
+++ b/solvers/cns/okl/cnsVolumeQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,19 +27,19 @@ SOFTWARE.
 
 // isothermal Compressible Navier-Stokes
 @kernel void cnsVolumeQuad3D(const dlong Nelements,
-			     const int advSwitch,
-			     const dfloat fx,
-			     const dfloat fy,
-			     const dfloat fz, 
-			     @restrict const  dfloat *  vgeo,
-			     @restrict const  dfloat *  x,
-			     @restrict const  dfloat *  y,
-			     @restrict const  dfloat *  z,
-			     @restrict const  dfloat *  D,
-			     @restrict const  dfloat *  viscousStresses,
-			     @restrict const  dfloat *  q,
-			     @restrict dfloat *  rhsq){
-  
+                             const int advSwitch,
+                             const dfloat fx,
+                             const dfloat fy,
+                             const dfloat fz,
+                             @restrict const  dfloat *  vgeo,
+                             @restrict const  dfloat *  x,
+                             @restrict const  dfloat *  y,
+                             @restrict const  dfloat *  z,
+                             @restrict const  dfloat *  D,
+                             @restrict const  dfloat *  viscousStresses,
+                             @restrict const  dfloat *  q,
+                             @restrict dfloat *  rhsq){
+
   for(dlong e=0;e<Nelements;++e;@outer(0)){
 
     @shared dfloat s_D[p_Nq][p_Nq];
@@ -48,7 +48,7 @@ SOFTWARE.
     @shared dfloat s_H[p_Nfields][p_Nq][p_Nq];
 
     @exclusive dfloat r, ru, rv, rw;
-    
+
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
         s_D[j][i] = D[j*p_Nq+i];
@@ -57,25 +57,25 @@ SOFTWARE.
         const dlong gbase = e*p_Np*p_Nvgeo + j*p_Nq + i;
         const dfloat rx = vgeo[gbase+p_Np*p_RXID];
         const dfloat ry = vgeo[gbase+p_Np*p_RYID];
-	const dfloat rz = vgeo[gbase+p_Np*p_RZID];
+        const dfloat rz = vgeo[gbase+p_Np*p_RZID];
 
         const dfloat sx = vgeo[gbase+p_Np*p_SXID];
         const dfloat sy = vgeo[gbase+p_Np*p_SYID];
-	const dfloat sz = vgeo[gbase+p_Np*p_SZID];
+        const dfloat sz = vgeo[gbase+p_Np*p_SZID];
 
         const dfloat tx = vgeo[gbase+p_Np*p_TXID];
         const dfloat ty = vgeo[gbase+p_Np*p_TYID];
-	const dfloat tz = vgeo[gbase+p_Np*p_TZID];
-	
+        const dfloat tz = vgeo[gbase+p_Np*p_TZID];
+
         const dfloat JW = vgeo[gbase+p_Np*p_JWID];
 
         // conserved variables
         const dlong  qbase = e*p_Np*p_Nfields + j*p_Nq + i;
 
-	r  = q[qbase+0*p_Np];
+        r  = q[qbase+0*p_Np];
         ru = q[qbase+1*p_Np];
         rv = q[qbase+2*p_Np];
-	rw = q[qbase+3*p_Np];
+        rw = q[qbase+3*p_Np];
 
         const dfloat p  = r*p_RT;
 
@@ -86,71 +86,70 @@ SOFTWARE.
         const dlong id = e*p_Np*p_Nstresses + j*p_Nq + i;
         const dfloat T11 = viscousStresses[id+0*p_Np];
         const dfloat T12 = viscousStresses[id+1*p_Np];
-	const dfloat T13 = viscousStresses[id+2*p_Np];
+        const dfloat T13 = viscousStresses[id+2*p_Np];
         const dfloat T22 = viscousStresses[id+3*p_Np];
-	const dfloat T23 = viscousStresses[id+4*p_Np];
-	const dfloat T33 = viscousStresses[id+5*p_Np];
-        
+        const dfloat T23 = viscousStresses[id+4*p_Np];
+        const dfloat T33 = viscousStresses[id+5*p_Np];
+
         // (1/J) \hat{div} (G*[F;G])
 
         {
           // F0 = ru, G0 = rv
           const dfloat f = -advSwitch*ru;
           const dfloat g = -advSwitch*rv;
-	  const dfloat h = -advSwitch*rw;
+          const dfloat h = -advSwitch*rw;
 
           s_F[0][j][i] = JW*(rx*f + ry*g + rz*h);
           s_G[0][j][i] = JW*(sx*f + sy*g + sz*h);
-	  s_H[0][j][i] = JW*(tx*f + ty*g + tz*h);
+          s_H[0][j][i] = JW*(tx*f + ty*g + tz*h);
         }
 
-	//	rz*JW*(-advSwitch*rw)
-	// df/dx + dg/dy  + dg/dz
-	// r,s stuff + tx*f + 
-	
+        //      rz*JW*(-advSwitch*rw)
+        // df/dx + dg/dy  + dh/dz
+        // r,s stuff + tx*f +
+
         {
           // F1 = 2*mu*S11 - (ru^2+p), G1 = 2*mu*S12 - (rvu)
           const dfloat f = T11-advSwitch*(ru*u+p);
           const dfloat g = T12-advSwitch*(rv*u);
-	  const dfloat h = T13-advSwitch*(rw*u);
+          const dfloat h = T13-advSwitch*(rw*u);
           s_F[1][j][i] = JW*(rx*f + ry*g + rz*h);
           s_G[1][j][i] = JW*(sx*f + sy*g + sz*h);
-	  s_H[1][j][i] = JW*(tx*f + ty*g + tz*h);
+          s_H[1][j][i] = JW*(tx*f + ty*g + tz*h);
         }
 
         {
           // F2 = 2*mu*S21 - (ruv), G2 = 2*mu*S22 - (rv^2+p)
           const dfloat f = T12-advSwitch*(ru*v);
           const dfloat g = T22-advSwitch*(rv*v+p);
-	  const dfloat h = T23-advSwitch*(rw*v);
+          const dfloat h = T23-advSwitch*(rw*v);
           s_F[2][j][i] = JW*(rx*f + ry*g + rz*h);
           s_G[2][j][i] = JW*(sx*f + sy*g + sz*h);
-	  s_H[2][j][i] = JW*(tx*f + ty*g + tz*h);
+          s_H[2][j][i] = JW*(tx*f + ty*g + tz*h);
         }
 
         {
           const dfloat f = T13-advSwitch*(ru*w);
           const dfloat g = T23-advSwitch*(rv*w);
-	  const dfloat h = T33-advSwitch*(rw*w+p);
-          s_F[3][j][i] = JW*(rx*f + ry*g + rz*h); 
+          const dfloat h = T33-advSwitch*(rw*w+p);
+          s_F[3][j][i] = JW*(rx*f + ry*g + rz*h);
           s_G[3][j][i] = JW*(sx*f + sy*g + sz*h);
-	  s_H[3][j][i] = JW*(tx*f + ty*g + tz*h);
+          s_H[3][j][i] = JW*(tx*f + ty*g + tz*h);
         }
       }
     }
 
-    @barrier("local");
-    
+
     for(int j=0;j<p_Nq;++j;@inner(1)){
-      for(int i=0;i<p_Nq;++i;@inner(0)){    
+      for(int i=0;i<p_Nq;++i;@inner(0)){
         const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
         const dfloat invJW = vgeo[gid + p_IJWID*p_Np];
 
-	// TW: CHECK SIGN ???
+        // TW: CHECK SIGN ???
         dfloat rhsq0 = s_H[0][j][i];
-	dfloat rhsq1 = s_H[1][j][i];
-	dfloat rhsq2 = s_H[2][j][i];
-	dfloat rhsq3 = s_H[3][j][i];
+        dfloat rhsq1 = s_H[1][j][i];
+        dfloat rhsq2 = s_H[2][j][i];
+        dfloat rhsq3 = s_H[3][j][i];
 
         for(int n=0;n<p_Nq;++n){
           const dfloat Din = s_D[n][i];
@@ -161,37 +160,37 @@ SOFTWARE.
           rhsq1 += Djn*s_G[1][n][i];
           rhsq2 += Din*s_F[2][j][n];
           rhsq2 += Djn*s_G[2][n][i];
-	  rhsq3 += Din*s_F[3][j][n];
-	  rhsq3 += Djn*s_G[3][n][i];
+          rhsq3 += Din*s_F[3][j][n];
+          rhsq3 += Djn*s_G[3][n][i];
         }
-        
-	// constrain momentum changes to lie on sphere
-	const dfloat xij = x[i+j*p_Nq+e*p_Np];
-	const dfloat yij = y[i+j*p_Nq+e*p_Np];
-	const dfloat zij = z[i+j*p_Nq+e*p_Np];
-	
-	// add coriolis force to momentum equation
-	rhsq1 -= p_fainv*zij*(yij*rw-zij*rv); // check if density should be here
-	rhsq2 -= p_fainv*zij*(zij*ru-xij*rw);
-	rhsq3 -= p_fainv*zij*(xij*rv-yij*ru);
+
+        // constrain momentum changes to lie on sphere
+        const dfloat xij = x[i+j*p_Nq+e*p_Np];
+        const dfloat yij = y[i+j*p_Nq+e*p_Np];
+        const dfloat zij = z[i+j*p_Nq+e*p_Np];
+
+        // add coriolis force to momentum equation
+        rhsq1 -= p_fainv*zij*(yij*rw-zij*rv); // check if density should be here
+        rhsq2 -= p_fainv*zij*(zij*ru-xij*rw);
+        rhsq3 -= p_fainv*zij*(xij*rv-yij*ru);
 
 #if 0
-	// remove radial component of momentum change
-	const dfloat xdotrhsq = (rhsq1*xij + rhsq2*yij + rhsq3*zij)*p_invRadiusSq;
-	
-	rhsq1 -= xij*xdotrhsq;
-	rhsq2 -= yij*xdotrhsq;
-	rhsq3 -= zij*xdotrhsq;
+        // remove radial component of momentum change
+        const dfloat xdotrhsq = (rhsq1*xij + rhsq2*yij + rhsq3*zij)*p_invRadiusSq;
+
+        rhsq1 -= xij*xdotrhsq;
+        rhsq2 -= yij*xdotrhsq;
+        rhsq3 -= zij*xdotrhsq;
 #endif
-	
+
         const dlong base = e*p_Np*p_Nfields + j*p_Nq + i;
-	
+
         // move to rhs . Why negative
         rhsq[base+0*p_Np] = -invJW*rhsq0;
         rhsq[base+1*p_Np] = -invJW*rhsq1;
         rhsq[base+2*p_Np] = -invJW*rhsq2;
-	rhsq[base+3*p_Np] = -invJW*rhsq3;
-        
+        rhsq[base+3*p_Np] = -invJW*rhsq3;
+
       }
     }
   }
@@ -204,37 +203,36 @@ SOFTWARE.
                                     const dfloat mu,
                                     @restrict const  dfloat *  q,
                                     @restrict dfloat *  viscousStresses){
-  
+
   for(dlong e=0;e<Nelements;++e;@outer(0)){
 
     @shared dfloat s_D[p_Nq][p_Nq];
     @shared dfloat s_u[p_Nq][p_Nq];
     @shared dfloat s_v[p_Nq][p_Nq];
     @shared dfloat s_w[p_Nq][p_Nq];
-    
+
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
-        
+
         s_D[j][i] = D[j*p_Nq+i];
 
         const dlong qbase = e*p_Nfields*p_Np + j*p_Nq + i;
         const dfloat r  = q[qbase + 0*p_Np];
         const dfloat ru = q[qbase + 1*p_Np];
         const dfloat rv = q[qbase + 2*p_Np];
-	const dfloat rw = q[qbase + 3*p_Np];
-        
+        const dfloat rw = q[qbase + 3*p_Np];
+
         s_u[j][i] = ru/r;
         s_v[j][i] = rv/r;
-	s_w[j][i] = rw/r;
-        
+        s_w[j][i] = rw/r;
+
       }
     }
-    
-    @barrier("local");
-    
+
+
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
-        
+
         dfloat dudr = 0, duds = 0, dvdr = 0, dvds = 0, dwdr = 0, dwds = 0;
 
         for(int n=0;n<p_Nq;++n){
@@ -247,51 +245,51 @@ SOFTWARE.
           dvdr += Din*s_v[j][n];
           dvds += Djn*s_v[n][i];
 
-	  dwdr += Din*s_w[j][n];
+          dwdr += Din*s_w[j][n];
           dwds += Djn*s_w[n][i];
         }
 
         const dlong gbase = e*p_Np*p_Nvgeo + j*p_Nq + i;
         const dfloat rx = vgeo[gbase+p_Np*p_RXID];
         const dfloat ry = vgeo[gbase+p_Np*p_RYID];
-	const dfloat rz = vgeo[gbase+p_Np*p_RZID];
+        const dfloat rz = vgeo[gbase+p_Np*p_RZID];
         const dfloat sx = vgeo[gbase+p_Np*p_SXID];
         const dfloat sy = vgeo[gbase+p_Np*p_SYID];
-	const dfloat sz = vgeo[gbase+p_Np*p_SZID];
-	const dfloat tx = vgeo[gbase+p_Np*p_TXID];
+        const dfloat sz = vgeo[gbase+p_Np*p_SZID];
+        const dfloat tx = vgeo[gbase+p_Np*p_TXID];
         const dfloat ty = vgeo[gbase+p_Np*p_TYID];
-	const dfloat tz = vgeo[gbase+p_Np*p_TZID];
+        const dfloat tz = vgeo[gbase+p_Np*p_TZID];
 
         const dfloat dudx = rx*dudr + sx*duds + tx*s_u[j][i];
         const dfloat dudy = ry*dudr + sy*duds + ty*s_u[j][i];
-	const dfloat dudz = rz*dudr + sz*duds + tz*s_u[j][i];
+        const dfloat dudz = rz*dudr + sz*duds + tz*s_u[j][i];
 
         const dfloat dvdx = rx*dvdr + sx*dvds + tx*s_v[j][i];
         const dfloat dvdy = ry*dvdr + sy*dvds + ty*s_v[j][i];
-	const dfloat dvdz = rz*dvdr + sz*dvds + tz*s_v[j][i];
+        const dfloat dvdz = rz*dvdr + sz*dvds + tz*s_v[j][i];
 
-	const dfloat dwdx = rx*dwdr + sx*dwds + tx*s_w[j][i];
+        const dfloat dwdx = rx*dwdr + sx*dwds + tx*s_w[j][i];
         const dfloat dwdy = ry*dwdr + sy*dwds + ty*s_w[j][i];
-	const dfloat dwdz = rz*dwdr + sz*dwds + tz*s_w[j][i];
-        
-	const dfloat divV = dudx + dvdy + dwdz;
+        const dfloat dwdz = rz*dwdr + sz*dwds + tz*s_w[j][i];
+
+        const dfloat divV = dudx + dvdy + dwdz;
 
         const dfloat S11 = p_half*(dudx+dudx) - p_third*divV;
-	const dfloat S22 = p_half*(dvdy+dvdy) - p_third*divV;
-	const dfloat S33 = p_half*(dwdz+dwdz) - p_third*divV;
-	
+        const dfloat S22 = p_half*(dvdy+dvdy) - p_third*divV;
+        const dfloat S33 = p_half*(dwdz+dwdz) - p_third*divV;
+
         const dfloat S12 = p_half*(dudy+dvdx);
-	const dfloat S13 = p_half*(dudz+dwdx);
-	const dfloat S23 = p_half*(dvdz+dwdy);
+        const dfloat S13 = p_half*(dudz+dwdx);
+        const dfloat S23 = p_half*(dvdz+dwdy);
 
         const dlong sbase = e*p_Nstresses*p_Np + j*p_Nq + i;
-	
+
         viscousStresses[sbase + 0*p_Np] = p_two*mu*S11;
         viscousStresses[sbase + 1*p_Np] = p_two*mu*S12;
-	viscousStresses[sbase + 2*p_Np] = p_two*mu*S13;
+        viscousStresses[sbase + 2*p_Np] = p_two*mu*S13;
         viscousStresses[sbase + 3*p_Np] = p_two*mu*S22;
-	viscousStresses[sbase + 4*p_Np] = p_two*mu*S23;
-	viscousStresses[sbase + 5*p_Np] = p_two*mu*S33;
+        viscousStresses[sbase + 4*p_Np] = p_two*mu*S23;
+        viscousStresses[sbase + 5*p_Np] = p_two*mu*S33;
       }
     }
   }
diff --git a/solvers/cns/okl/cnsVolumeTet3D.okl b/solvers/cns/okl/cnsVolumeTet3D.okl
index 166a3724a..30cfed6c4 100644
--- a/solvers/cns/okl/cnsVolumeTet3D.okl
+++ b/solvers/cns/okl/cnsVolumeTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -151,7 +151,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){
 
diff --git a/solvers/cns/okl/cnsVolumeTri2D.okl b/solvers/cns/okl/cnsVolumeTri2D.okl
index dad526ec2..20bfb42bc 100644
--- a/solvers/cns/okl/cnsVolumeTri2D.okl
+++ b/solvers/cns/okl/cnsVolumeTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -115,7 +115,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){
 
diff --git a/solvers/cns/okl/cnsVorticityHex3D.okl b/solvers/cns/okl/cnsVorticityHex3D.okl
index 3e37e6b2a..afa397a69 100644
--- a/solvers/cns/okl/cnsVorticityHex3D.okl
+++ b/solvers/cns/okl/cnsVorticityHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -44,26 +44,25 @@ SOFTWARE.
         for(int i=0;i<p_Nq;++i;@inner(0)){
           const dlong e = eo+es; // element in block
           if(e<Nelements){
-	    for(int k=0;k<p_Nq;++k){
-	      const dlong qbase = e*p_Nfields*p_Np + k*p_Nq*p_Nq + j*p_Nq +i;
-	      const dfloat r  = q[qbase + 0*p_Np];
-	      const dfloat ru = q[qbase + 1*p_Np];
-	      const dfloat rv = q[qbase + 2*p_Np];
-	      const dfloat rw = q[qbase + 3*p_Np];
-
-	      s_u[es][k][j][i] = ru/r;
-	      s_v[es][k][j][i] = rv/r;
-	      s_w[es][k][j][i] = rw/r;
-	    }
-	  }
-
-	  if (es==0)
-	    s_DT[j][i] = DT[j*p_Nq+i];
+            for(int k=0;k<p_Nq;++k){
+              const dlong qbase = e*p_Nfields*p_Np + k*p_Nq*p_Nq + j*p_Nq +i;
+              const dfloat r  = q[qbase + 0*p_Np];
+              const dfloat ru = q[qbase + 1*p_Np];
+              const dfloat rv = q[qbase + 2*p_Np];
+              const dfloat rw = q[qbase + 3*p_Np];
+
+              s_u[es][k][j][i] = ru/r;
+              s_v[es][k][j][i] = rv/r;
+              s_w[es][k][j][i] = rw/r;
+            }
+          }
+
+          if (es==0)
+            s_DT[j][i] = DT[j*p_Nq+i];
         }
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -71,58 +70,58 @@ SOFTWARE.
           const dlong e = eo+es; // element in block
           if(e<Nelements){
 
-	    for(int k=0;k<p_Nq;++k){
-
-	      const dlong gid = e*p_Np*p_Nvgeo+ k*p_Nq*p_Nq + j*p_Nq +i;
-
-	      const dfloat drdx = vgeo[gid + p_RXID*p_Np];
-	      const dfloat drdy = vgeo[gid + p_RYID*p_Np];
-	      const dfloat drdz = vgeo[gid + p_RZID*p_Np];
-	      const dfloat dsdx = vgeo[gid + p_SXID*p_Np];
-	      const dfloat dsdy = vgeo[gid + p_SYID*p_Np];
-	      const dfloat dsdz = vgeo[gid + p_SZID*p_Np];
-	      const dfloat dtdx = vgeo[gid + p_TXID*p_Np];
-	      const dfloat dtdy = vgeo[gid + p_TYID*p_Np];
-	      const dfloat dtdz = vgeo[gid + p_TZID*p_Np];
-
-	      // compute 1D derivatives
-	      dfloat ur = 0, vr = 0, wr = 0;
-	      dfloat us = 0, vs = 0, ws = 0;
-	      dfloat ut = 0, vt = 0, wt = 0;
-
-	      #pragma unroll p_Nq
-		for(int n=0;n<p_Nq;++n) {
-		  const dfloat Dr = s_DT[i][n];
-		  const dfloat Ds = s_DT[j][n];
-		  const dfloat Dt = s_DT[k][n];
-		  ur += Dr*s_u[es][k][j][n];
-		  us += Ds*s_u[es][k][n][i];
-		  ut += Dt*s_u[es][n][j][i];
-		  vr += Dr*s_v[es][k][j][n];
-		  vs += Ds*s_v[es][k][n][i];
-		  vt += Dt*s_v[es][n][j][i];
-		  wr += Dr*s_w[es][k][j][n];
-		  ws += Ds*s_w[es][k][n][i];
-		  wt += Dt*s_w[es][n][j][i];
-		}
-
-	      const dfloat uy = drdy*ur + dsdy*us + dtdy*ut;
-	      const dfloat uz = drdz*ur + dsdz*us + dtdz*ut;
-
-	      const dfloat vx = drdx*vr + dsdx*vs + dtdx*vt;
-	      const dfloat vz = drdz*vr + dsdz*vs + dtdz*vt;
-
-	      const dfloat wx = drdx*wr + dsdx*ws + dtdx*wt;
-	      const dfloat wy = drdy*wr + dsdy*ws + dtdy*wt;
-
-	      const dlong id = 3*e*p_Np+k*p_Nq*p_Nq + j*p_Nq+i;
-
-	      Vort[id+0*p_Np] = wy-vz;
-	      Vort[id+1*p_Np] = uz-wx;
-	      Vort[id+2*p_Np] = vx-uy;
-	    }
-	  }
-	}
+            for(int k=0;k<p_Nq;++k){
+
+              const dlong gid = e*p_Np*p_Nvgeo+ k*p_Nq*p_Nq + j*p_Nq +i;
+
+              const dfloat drdx = vgeo[gid + p_RXID*p_Np];
+              const dfloat drdy = vgeo[gid + p_RYID*p_Np];
+              const dfloat drdz = vgeo[gid + p_RZID*p_Np];
+              const dfloat dsdx = vgeo[gid + p_SXID*p_Np];
+              const dfloat dsdy = vgeo[gid + p_SYID*p_Np];
+              const dfloat dsdz = vgeo[gid + p_SZID*p_Np];
+              const dfloat dtdx = vgeo[gid + p_TXID*p_Np];
+              const dfloat dtdy = vgeo[gid + p_TYID*p_Np];
+              const dfloat dtdz = vgeo[gid + p_TZID*p_Np];
+
+              // compute 1D derivatives
+              dfloat ur = 0, vr = 0, wr = 0;
+              dfloat us = 0, vs = 0, ws = 0;
+              dfloat ut = 0, vt = 0, wt = 0;
+
+              #pragma unroll p_Nq
+                for(int n=0;n<p_Nq;++n) {
+                  const dfloat Dr = s_DT[i][n];
+                  const dfloat Ds = s_DT[j][n];
+                  const dfloat Dt = s_DT[k][n];
+                  ur += Dr*s_u[es][k][j][n];
+                  us += Ds*s_u[es][k][n][i];
+                  ut += Dt*s_u[es][n][j][i];
+                  vr += Dr*s_v[es][k][j][n];
+                  vs += Ds*s_v[es][k][n][i];
+                  vt += Dt*s_v[es][n][j][i];
+                  wr += Dr*s_w[es][k][j][n];
+                  ws += Ds*s_w[es][k][n][i];
+                  wt += Dt*s_w[es][n][j][i];
+                }
+
+              const dfloat uy = drdy*ur + dsdy*us + dtdy*ut;
+              const dfloat uz = drdz*ur + dsdz*us + dtdz*ut;
+
+              const dfloat vx = drdx*vr + dsdx*vs + dtdx*vt;
+              const dfloat vz = drdz*vr + dsdz*vs + dtdz*vt;
+
+              const dfloat wx = drdx*wr + dsdx*ws + dtdx*wt;
+              const dfloat wy = drdy*wr + dsdy*ws + dtdy*wt;
+
+              const dlong id = 3*e*p_Np+k*p_Nq*p_Nq + j*p_Nq+i;
+
+              Vort[id+0*p_Np] = wy-vz;
+              Vort[id+1*p_Np] = uz-wx;
+              Vort[id+2*p_Np] = vx-uy;
+            }
+          }
+        }
       }
     }
   }
diff --git a/solvers/cns/okl/cnsVorticityQuad2D.okl b/solvers/cns/okl/cnsVorticityQuad2D.okl
index 680f022aa..bc7b68764 100644
--- a/solvers/cns/okl/cnsVorticityQuad2D.okl
+++ b/solvers/cns/okl/cnsVorticityQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -57,7 +57,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -94,4 +93,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/cns/okl/cnsVorticityQuad3D.okl b/solvers/cns/okl/cnsVorticityQuad3D.okl
index 580b1a9fa..f3e85c3bf 100644
--- a/solvers/cns/okl/cnsVorticityQuad3D.okl
+++ b/solvers/cns/okl/cnsVorticityQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,33 +25,33 @@ SOFTWARE.
 */
 
 @kernel void cnsVorticityQuad3D(const dlong Nelements,
-				@restrict const  dfloat *  vgeo,
-				@restrict const  dfloat *  D,
-				@restrict const  dfloat *  q,
-				@restrict dfloat *  Vort){  
-  
+                                @restrict const  dfloat *  vgeo,
+                                @restrict const  dfloat *  D,
+                                @restrict const  dfloat *  q,
+                                @restrict dfloat *  Vort){
+
   // block partition of elements
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){
-    
+
     @shared dfloat s_u[p_NblockV][p_Nq][p_Nq];
     @shared dfloat s_v[p_NblockV][p_Nq][p_Nq];
     @shared dfloat s_w[p_NblockV][p_Nq][p_Nq];
     @shared dfloat s_D[p_Nq][p_Nq];
 
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_Nq;++j;@inner(1)){ 
-        for(int i=0;i<p_Nq;++i;@inner(0)){    
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_Nq;++j;@inner(1)){
+        for(int i=0;i<p_Nq;++i;@inner(0)){
           const dlong e = eo+es; // element in block
-          if(e<Nelements){ 
+          if(e<Nelements){
             const dlong qbase = e*p_Nfields*p_Np + j*p_Nq +i;
             const dfloat r  = q[qbase + 0*p_Np];
             const dfloat ru = q[qbase + 1*p_Np];
             const dfloat rv = q[qbase + 2*p_Np];
-	    const dfloat rw = q[qbase + 3*p_Np];
-            
+            const dfloat rw = q[qbase + 3*p_Np];
+
             s_u[es][j][i] = ru/r;
             s_v[es][j][i] = rv/r;
-	    s_w[es][j][i] = rw/r;
+            s_w[es][j][i] = rw/r;
 
             if (es==0)
               s_D[j][i] = D[j*p_Nq+i];
@@ -60,23 +60,22 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_Nq;++j;@inner(1)){ 
-        for(int i=0;i<p_Nq;++i;@inner(0)){    
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_Nq;++j;@inner(1)){
+        for(int i=0;i<p_Nq;++i;@inner(0)){
           const dlong e = eo+es; // element in block
-          if(e<Nelements){ 
+          if(e<Nelements){
             const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
             const dfloat drdx = vgeo[gid + p_RXID*p_Np];
             const dfloat drdy = vgeo[gid + p_RYID*p_Np];
-	    const dfloat drdz = vgeo[gid + p_RZID*p_Np];
+            const dfloat drdz = vgeo[gid + p_RZID*p_Np];
             const dfloat dsdx = vgeo[gid + p_SXID*p_Np];
             const dfloat dsdy = vgeo[gid + p_SYID*p_Np];
-	    const dfloat dsdz = vgeo[gid + p_SZID*p_Np];
-	    const dfloat dtdx = vgeo[gid + p_TXID*p_Np];
-	    const dfloat dtdy = vgeo[gid + p_TYID*p_Np];
-	    const dfloat dtdz = vgeo[gid + p_TZID*p_Np];
+            const dfloat dsdz = vgeo[gid + p_SZID*p_Np];
+            const dfloat dtdx = vgeo[gid + p_TXID*p_Np];
+            const dfloat dtdy = vgeo[gid + p_TYID*p_Np];
+            const dfloat dtdz = vgeo[gid + p_TZID*p_Np];
 
             // compute 1D derivatives
             dfloat ur = 0, vr = 0, wr = 0;
@@ -90,23 +89,23 @@ SOFTWARE.
                 us += Ds*s_u[es][n][i];
                 vr += Dr*s_v[es][j][n];
                 vs += Ds*s_v[es][n][i];
-		wr += Dr*s_w[es][j][n];
+                wr += Dr*s_w[es][j][n];
                 ws += Ds*s_w[es][n][i];
               }
 
             const dfloat uy = drdy*ur + dsdy*us;// + dtdy*s_u[es][j][i];
-	    const dfloat uz = drdz*ur + dsdz*us;// + dtdz*s_u[es][j][i];
+            const dfloat uz = drdz*ur + dsdz*us;// + dtdz*s_u[es][j][i];
 
-	    const dfloat vx = drdx*vr + dsdx*vs;// + dtdx*s_v[es][j][i];
-	    const dfloat vz = drdz*vr + dsdz*vs;// + dtdz*s_v[es][j][i];
+            const dfloat vx = drdx*vr + dsdx*vs;// + dtdx*s_v[es][j][i];
+            const dfloat vz = drdz*vr + dsdz*vs;// + dtdz*s_v[es][j][i];
 
-	    const dfloat wx = drdx*wr + dsdx*ws;// + dtdx*s_w[es][j][i];
-	    const dfloat wy = drdy*wr + dsdy*ws;// + dtdy*s_w[es][j][i];
+            const dfloat wx = drdx*wr + dsdx*ws;// + dtdx*s_w[es][j][i];
+            const dfloat wy = drdy*wr + dsdy*ws;// + dtdy*s_w[es][j][i];
 
-            const dlong id = e*p_Np*3+j*p_Nq+i; 
+            const dlong id = e*p_Np*3+j*p_Nq+i;
             Vort[id+0*p_Np] = wy-vz;
-	    Vort[id+1*p_Np] = uz-wx;
-	    Vort[id+2*p_Np] = vx-uy;
+            Vort[id+1*p_Np] = uz-wx;
+            Vort[id+2*p_Np] = vx-uy;
           }
         }
       }
diff --git a/solvers/cns/okl/cnsVorticityTet3D.okl b/solvers/cns/okl/cnsVorticityTet3D.okl
index cbf02cfc9..a4dd29c46 100644
--- a/solvers/cns/okl/cnsVorticityTet3D.okl
+++ b/solvers/cns/okl/cnsVorticityTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -53,7 +53,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
diff --git a/solvers/cns/okl/cnsVorticityTri2D.okl b/solvers/cns/okl/cnsVorticityTri2D.okl
index cb921d8e8..4959db696 100644
--- a/solvers/cns/okl/cnsVorticityTri2D.okl
+++ b/solvers/cns/okl/cnsVorticityTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -50,7 +50,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
diff --git a/solvers/cns/src/cnsPlotFields.cpp b/solvers/cns/src/cnsPlotFields.cpp
index 0b5d40252..acdc4cddb 100644
--- a/solvers/cns/src/cnsPlotFields.cpp
+++ b/solvers/cns/src/cnsPlotFields.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,11 @@ SOFTWARE.
 #include "cns.hpp"
 
 // interpolate data to plot nodes and save to file (one per process)
-void cns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
+void cns_t::PlotFields(memory<dfloat> Q, memory<dfloat> V, std::string fileName){
 
   FILE *fp;
 
-  fp = fopen(fileName, "w");
+  fp = fopen(fileName.c_str(), "w"); // NOTE(review): result is not checked for NULL before the fprintf calls that follow
 
   fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
   fprintf(fp, "  <UnstructuredGrid>\n");
@@ -44,40 +44,46 @@ void cns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
   fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
 
   //scratch space for interpolation
-  size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat);
-  dfloat* scratch = (dfloat *) malloc(2*NscratchBytes);
+  size_t Nscratch = std::max(mesh.Np, mesh.plotNp);
+  memory<dfloat> scratch(2*Nscratch);
 
-  dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ix(mesh.plotNp);
+  memory<dfloat> Iy(mesh.plotNp);
+  memory<dfloat> Iz(mesh.plotNp);
 
   // compute plot node coordinates on the fly
   for(dlong e=0;e<mesh.Nelements;++e){
     mesh.PlotInterp(mesh.x + e*mesh.Np, Ix, scratch);
     mesh.PlotInterp(mesh.y + e*mesh.Np, Iy, scratch);
-    mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
+    if(mesh.dim==3) // z is interpolated only for 3D meshes; the dim==2 branch below prints 0.0 as the third coordinate
+      mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
 
-    for(int n=0;n<mesh.plotNp;++n){
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+    if (mesh.dim==2) {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],0.0);
+      }
+    } else {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+      }
     }
   }
   fprintf(fp, "        </DataArray>\n");
   fprintf(fp, "      </Points>\n");
 
-  free(Ix); free(Iy); free(Iz);
-
-  dfloat* p = (dfloat *) malloc(mesh.Np*sizeof(dfloat));
-  dfloat* u = (dfloat *) malloc(mesh.Np*sizeof(dfloat));
-  dfloat* v = (dfloat *) malloc(mesh.Np*sizeof(dfloat));
-  dfloat* w = (dfloat *) malloc(mesh.Np*sizeof(dfloat));
+  memory<dfloat> p(mesh.Np);
+  memory<dfloat> u(mesh.Np);
+  memory<dfloat> v(mesh.Np);
+  memory<dfloat> w(mesh.Np);
 
-  dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ip(mesh.plotNp);
+  memory<dfloat> Iu(mesh.plotNp);
+  memory<dfloat> Iv(mesh.plotNp);
+  memory<dfloat> Iw(mesh.plotNp);
 
-  if (Q!=NULL) {
+  if (Q.length()!=0) { // NOTE(review): <PointData> is opened inside this branch but </PointData> is printed unconditionally later; confirm Q is never empty
     // write out density
     fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
     fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Density\" Format=\"ascii\">\n");
@@ -144,7 +150,7 @@ void cns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
     }
   }
 
-  if (V!=NULL) {
+  if (V.length()!=0) { // an empty V (formerly a NULL pointer) skips the vorticity output
     // write out vorticity
     if(mesh.dim==2){
       fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Vorticity\" Format=\"ascii\">\n");
@@ -175,9 +181,6 @@ void cns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
 
   fprintf(fp, "     </PointData>\n");
 
-  free(p); free(u); free(v); free(w);
-  free(Ip); free(Iu); free(Iv); free(Iw);
-
   fprintf(fp, "    <Cells>\n");
   fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
 
@@ -218,6 +221,4 @@ void cns_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
   fprintf(fp, "  </UnstructuredGrid>\n");
   fprintf(fp, "</VTKFile>\n");
   fclose(fp);
-
-  free(scratch);
 }
diff --git a/solvers/cns/src/cnsReport.cpp b/solvers/cns/src/cnsReport.cpp
index b196b20af..474fb69aa 100644
--- a/solvers/cns/src/cnsReport.cpp
+++ b/solvers/cns/src/cnsReport.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -37,7 +37,7 @@ void cns_t::Report(dfloat time, int tstep){
   mesh.MassMatrixApply(o_q, o_Mq);
 
   dlong Nentries = mesh.Nelements*mesh.Np*Nfields;
-  dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+  dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
   if(mesh.rank==0)
     printf("%5.2f (%d), %5.2f (time, timestep, norm)\n", time, tstep, norm2);
@@ -49,11 +49,11 @@ void cns_t::Report(dfloat time, int tstep){
     o_Vort.copyTo(Vort);
 
     // output field files
-    string name;
+    std::string name;
     settings.getSetting("OUTPUT FILE NAME", name);
     char fname[BUFSIZ];
     sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++);
 
-    PlotFields(q, Vort, fname);
+    PlotFields(q, Vort, std::string(fname));
   }
 }
diff --git a/solvers/cns/src/cnsRun.cpp b/solvers/cns/src/cnsRun.cpp
index b7e70dae3..5fa52e93d 100644
--- a/solvers/cns/src/cnsRun.cpp
+++ b/solvers/cns/src/cnsRun.cpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -51,10 +51,10 @@ void cns_t::Run(){
   dfloat dtAdv  = cfl/(vmax*(mesh.N+1.)*(mesh.N+1.));
   dfloat dtVisc = cfl*pow(hmin, 2)/(pow(mesh.N+1,4)*mu);
 
-  dfloat dt = mymin(dtAdv, dtVisc);
-  timeStepper->SetTimeStep(dt);
+  dfloat dt = std::min(dtAdv, dtVisc);
+  timeStepper.SetTimeStep(dt);
 
-  timeStepper->Run(o_q, startTime, finalTime);
+  timeStepper.Run(*this, o_q, startTime, finalTime);
 
   // output norm of final solution
   {
@@ -62,7 +62,7 @@ void cns_t::Run(){
     mesh.MassMatrixApply(o_q, o_Mq);
 
     dlong Nentries = mesh.Nelements*mesh.Np*Nfields;
-    dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+    dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
     if(mesh.rank==0)
       printf("Solution norm = %17.15lg\n", norm2);
diff --git a/solvers/cns/src/cnsSettings.cpp b/solvers/cns/src/cnsSettings.cpp
index 1d239a867..628d79f22 100644
--- a/solvers/cns/src/cnsSettings.cpp
+++ b/solvers/cns/src/cnsSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ SOFTWARE.
 #include "cns.hpp"
 
 //settings for cns solver
-cnsSettings_t::cnsSettings_t(MPI_Comm& _comm):
+cnsSettings_t::cnsSettings_t(comm_t _comm):
   settings_t(_comm) {
 
   newSetting("DATA FILE",
@@ -84,10 +84,7 @@ cnsSettings_t::cnsSettings_t(MPI_Comm& _comm):
 
 void cnsSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) {
     std::cout << "CNS Settings:\n\n";
 
     reportSetting("DATA FILE");
@@ -106,15 +103,15 @@ void cnsSettings_t::report() {
 
 void cnsSettings_t::parseFromFile(platformSettings_t& platformSettings,
                                   meshSettings_t& meshSettings,
-                                  const string filename) {
+                                  const std::string filename) {
   //read all settings from file
   settings_t s(comm);
   s.readSettingsFromFile(filename);
 
   for(auto it = s.settings.begin(); it != s.settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
-    const string val = set->getVal<string>();
+    setting_t& set = it->second;
+    const std::string name = set.getName();
+    const std::string val = set.getVal<std::string>();
     if (platformSettings.hasSetting(name))
       platformSettings.changeSetting(name, val);
     else if (meshSettings.hasSetting(name))
@@ -122,9 +119,7 @@ void cnsSettings_t::parseFromFile(platformSettings_t& platformSettings,
     else if (hasSetting(name)) //self
       changeSetting(name, val);
     else  {
-      stringstream ss;
-      ss << "Unknown setting: [" << name << "] requested";
-      LIBP_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested");
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/cns/src/cnsSetup.cpp b/solvers/cns/src/cnsSetup.cpp
index a7254037b..9d7d3a8d8 100644
--- a/solvers/cns/src/cnsSetup.cpp
+++ b/solvers/cns/src/cnsSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,243 +26,230 @@ SOFTWARE.
 
 #include "cns.hpp"
 
-cns_t& cns_t::Setup(platform_t& platform, mesh_t& mesh,
-                    cnsSettings_t& settings){
+void cns_t::Setup(platform_t& _platform, mesh_t& _mesh,
+                  cnsSettings_t& _settings){
 
-  cns_t* cns = new cns_t(platform, mesh, settings);
+  platform = _platform;
+  mesh = _mesh;
+  comm = _mesh.comm;
+  settings = _settings;
+
+  //Trigger JIT kernel builds
+  ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add);
 
   //get physical paramters
-  settings.getSetting("VISCOSITY", cns->mu);
-  settings.getSetting("GAMMA", cns->gamma);
+  settings.getSetting("VISCOSITY", mu);
+  settings.getSetting("GAMMA", gamma);
 
-  cns->cubature   = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 1:0;
-  cns->isothermal = (settings.compareSetting("ISOTHERMAL", "TRUE")) ? 1:0;
+  cubature   = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 1:0;
+  isothermal = (settings.compareSetting("ISOTHERMAL", "TRUE")) ? 1:0;
 
   //setup cubature
-  if (cns->cubature) {
+  if (cubature) {
     mesh.CubatureSetup();
-    mesh.CubatureNodes();
+    mesh.CubaturePhysicalNodes();
   }
 
-  cns->Nfields   = (mesh.dim==3) ? 4:3;
-  cns->Ngrads = mesh.dim*mesh.dim;
+  Nfields   = (mesh.dim==3) ? 4:3;
+  Ngrads = mesh.dim*mesh.dim;
 
-  if (!cns->isothermal) cns->Nfields++; //include energy equation
+  if (!isothermal) Nfields++; //include energy equation
 
-  dlong NlocalFields = mesh.Nelements*mesh.Np*cns->Nfields;
-  dlong NhaloFields  = mesh.totalHaloPairs*mesh.Np*cns->Nfields;
-  dlong NlocalGrads = mesh.Nelements*mesh.Np*cns->Ngrads;
-  dlong NhaloGrads  = mesh.totalHaloPairs*mesh.Np*cns->Ngrads;
+  dlong NlocalFields = mesh.Nelements*mesh.Np*Nfields;
+  dlong NhaloFields  = mesh.totalHaloPairs*mesh.Np*Nfields;
+  dlong NlocalGrads = mesh.Nelements*mesh.Np*Ngrads;
+  dlong NhaloGrads  = mesh.totalHaloPairs*mesh.Np*Ngrads;
 
   //setup timeStepper
   if (settings.compareSetting("TIME INTEGRATOR","AB3")){
-    cns->timeStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, cns->Nfields, *cns);
+    timeStepper.Setup<TimeStepper::ab3>(mesh.Nelements,
+                                        mesh.totalHaloPairs,
+                                        mesh.Np, Nfields, platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","LSERK4")){
-    cns->timeStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, cns->Nfields, *cns);
+    timeStepper.Setup<TimeStepper::lserk4>(mesh.Nelements,
+                                           mesh.totalHaloPairs,
+                                           mesh.Np, Nfields, platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","DOPRI5")){
-    cns->timeStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, cns->Nfields, *cns, mesh.comm);
+    timeStepper.Setup<TimeStepper::dopri5>(mesh.Nelements,
+                                           mesh.totalHaloPairs,
+                                           mesh.Np, Nfields, platform, comm);
   }
 
   //setup linear algebra module
-  platform.linAlg.InitKernels({"innerProd", "max"});
+  platform.linAlg().InitKernels({"innerProd", "max"});
 
   /*setup trace halo exchange */
-  cns->fieldTraceHalo = mesh.HaloTraceSetup(cns->Nfields);
-  cns->gradTraceHalo  = mesh.HaloTraceSetup(cns->Ngrads);
+  fieldTraceHalo = mesh.HaloTraceSetup(Nfields);
+  gradTraceHalo  = mesh.HaloTraceSetup(Ngrads);
 
   // compute samples of q at interpolation nodes
-  cns->q = (dfloat*) calloc(NlocalFields+NhaloFields, sizeof(dfloat));
-  cns->o_q = platform.malloc((NlocalFields+NhaloFields)*sizeof(dfloat),
-                                              cns->q);
+  q.malloc(NlocalFields+NhaloFields);
+  o_q = platform.malloc<dfloat>(q);
 
-  cns->gradq = (dfloat*) calloc(NlocalGrads+NhaloGrads, sizeof(dfloat));
-  cns->o_gradq = platform.malloc((NlocalGrads+NhaloGrads)*sizeof(dfloat),
-                                              cns->gradq);
+  gradq.malloc(NlocalGrads+NhaloGrads);
+  o_gradq = platform.malloc<dfloat>(gradq);
 
-  cns->Vort = (dfloat*) calloc(mesh.dim*mesh.Nelements*mesh.Np, sizeof(dfloat));
-  cns->o_Vort = platform.malloc((mesh.dim*mesh.Nelements*mesh.Np)*sizeof(dfloat),
-                                              cns->Vort);
+  Vort.malloc(mesh.dim*mesh.Nelements*mesh.Np);
+  o_Vort = platform.malloc<dfloat>(Vort);
 
   //storage for M*q during reporting
-  cns->o_Mq = platform.malloc((NlocalFields+NhaloFields)*sizeof(dfloat), cns->q);
-  mesh.MassMatrixKernelSetup(cns->Nfields); // mass matrix operator
+  o_Mq = platform.malloc<dfloat>(q);
+  mesh.MassMatrixKernelSetup(Nfields); // mass matrix operator
 
   // OCCA build stuff
-  occa::properties kernelInfo = mesh.props; //copy base occa properties
+  properties_t kernelInfo = mesh.props; //copy base occa properties
 
   //add boundary data to kernel info
-  string dataFileName;
+  std::string dataFileName;
   settings.getSetting("DATA FILE", dataFileName);
   kernelInfo["includes"] += dataFileName;
 
-  kernelInfo["defines/" "p_Nfields"]= cns->Nfields;
-  kernelInfo["defines/" "p_Ngrads"]= cns->Ngrads;
+  kernelInfo["defines/" "p_Nfields"]= Nfields;
+  kernelInfo["defines/" "p_Ngrads"]= Ngrads;
 
-  int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces));
+  int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces));
   kernelInfo["defines/" "p_maxNodes"]= maxNodes;
 
   int blockMax = 256;
   if (platform.device.mode() == "CUDA") blockMax = 512;
 
-  int NblockV = mymax(1, blockMax/mesh.Np);
+  int NblockV = std::max(1, blockMax/mesh.Np);
   kernelInfo["defines/" "p_NblockV"]= NblockV;
 
-  int NblockS = mymax(1, blockMax/maxNodes);
+  int NblockS = std::max(1, blockMax/maxNodes);
   kernelInfo["defines/" "p_NblockS"]= NblockS;
 
-  if (cns->cubature) {
-    int cubMaxNodes = mymax(mesh.Np, (mesh.intNfp*mesh.Nfaces));
+  if (cubature) {
+    int cubMaxNodes = std::max(mesh.Np, (mesh.intNfp*mesh.Nfaces));
     kernelInfo["defines/" "p_cubMaxNodes"]= cubMaxNodes;
-    int cubMaxNodes1 = mymax(mesh.Np, (mesh.intNfp));
+    int cubMaxNodes1 = std::max(mesh.Np, (mesh.intNfp));
     kernelInfo["defines/" "p_cubMaxNodes1"]= cubMaxNodes1;
 
-    int cubNblockV = mymax(1, blockMax/mesh.cubNp);
+    int cubNblockV = std::max(1, blockMax/mesh.cubNp);
     kernelInfo["defines/" "p_cubNblockV"]= cubNblockV;
 
-    int cubNblockS = mymax(1, blockMax/cubMaxNodes);
+    int cubNblockS = std::max(1, blockMax/cubMaxNodes);
     kernelInfo["defines/" "p_cubNblockS"]= cubNblockS;
   }
 
-  kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
-
   // set kernel name suffix
-  char *suffix;
-  if(mesh.elementType==TRIANGLES)
-    suffix = strdup("Tri2D");
-  if(mesh.elementType==QUADRILATERALS)
-    suffix = strdup("Quad2D");
-  if(mesh.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  if(mesh.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
-
-  if (cns->isothermal) {
-    if (cns->cubature) {
+  std::string suffix;
+  if(mesh.elementType==Mesh::TRIANGLES)
+    suffix = "Tri2D";
+  if(mesh.elementType==Mesh::QUADRILATERALS)
+    suffix = "Quad2D";
+  if(mesh.elementType==Mesh::TETRAHEDRA)
+    suffix = "Tet3D";
+  if(mesh.elementType==Mesh::HEXAHEDRA)
+    suffix = "Hex3D";
+
+  std::string oklFilePrefix = DCNS "/okl/";
+  std::string oklFileSuffix = ".okl";
+
+  std::string fileName, kernelName;
+
+  if (isothermal) {
+    if (cubature) {
       // kernels from volume file
-      sprintf(fileName, DCNS "/okl/cnsIsothermalCubatureVolume%s.okl", suffix);
-      sprintf(kernelName, "cnsIsothermalCubatureVolume%s", suffix);
+      fileName   = oklFilePrefix + "cnsIsothermalCubatureVolume" + suffix + oklFileSuffix;
+      kernelName = "cnsIsothermalCubatureVolume" + suffix;
 
-      cns->cubatureVolumeKernel =  platform.buildKernel(fileName, kernelName,
+      cubatureVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                                kernelInfo);
       // kernels from surface file
-      sprintf(fileName, DCNS "/okl/cnsIsothermalCubatureSurface%s.okl", suffix);
-      sprintf(kernelName, "cnsIsothermalCubatureSurface%s", suffix);
+      fileName   = oklFilePrefix + "cnsIsothermalCubatureSurface" + suffix + oklFileSuffix;
+      kernelName = "cnsIsothermalCubatureSurface" + suffix;
 
-      cns->cubatureSurfaceKernel = platform.buildKernel(fileName, kernelName,
+      cubatureSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                                kernelInfo);
     } else {
       // kernels from volume file
-      sprintf(fileName, DCNS "/okl/cnsIsothermalVolume%s.okl", suffix);
-      sprintf(kernelName, "cnsIsothermalVolume%s", suffix);
+      fileName   = oklFilePrefix + "cnsIsothermalVolume" + suffix + oklFileSuffix;
+      kernelName = "cnsIsothermalVolume" + suffix;
 
-      cns->volumeKernel =  platform.buildKernel(fileName, kernelName,
+      volumeKernel =  platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
       // kernels from surface file
-      sprintf(fileName, DCNS "/okl/cnsIsothermalSurface%s.okl", suffix);
-      sprintf(kernelName, "cnsIsothermalSurface%s", suffix);
+      fileName   = oklFilePrefix + "cnsIsothermalSurface" + suffix + oklFileSuffix;
+      kernelName = "cnsIsothermalSurface" + suffix;
 
-      cns->surfaceKernel = platform.buildKernel(fileName, kernelName,
+      surfaceKernel = platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
     }
   } else {
-    if (cns->cubature) {
+    if (cubature) {
       // kernels from volume file
-      sprintf(fileName, DCNS "/okl/cnsCubatureVolume%s.okl", suffix);
-      sprintf(kernelName, "cnsCubatureVolume%s", suffix);
+      fileName   = oklFilePrefix + "cnsCubatureVolume" + suffix + oklFileSuffix;
+      kernelName = "cnsCubatureVolume" + suffix;
 
-      cns->cubatureVolumeKernel =  platform.buildKernel(fileName, kernelName,
+      cubatureVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                                kernelInfo);
       // kernels from surface file
-      sprintf(fileName, DCNS "/okl/cnsCubatureSurface%s.okl", suffix);
-      sprintf(kernelName, "cnsCubatureSurface%s", suffix);
+      fileName   = oklFilePrefix + "cnsCubatureSurface" + suffix + oklFileSuffix;
+      kernelName = "cnsCubatureSurface" + suffix;
 
-      cns->cubatureSurfaceKernel = platform.buildKernel(fileName, kernelName,
+      cubatureSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                                kernelInfo);
     } else {
       // kernels from volume file
-      sprintf(fileName, DCNS "/okl/cnsVolume%s.okl", suffix);
-      sprintf(kernelName, "cnsVolume%s", suffix);
+      fileName   = oklFilePrefix + "cnsVolume" + suffix + oklFileSuffix;
+      kernelName = "cnsVolume" + suffix;
 
-      cns->volumeKernel =  platform.buildKernel(fileName, kernelName,
+      volumeKernel =  platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
       // kernels from surface file
-      sprintf(fileName, DCNS "/okl/cnsSurface%s.okl", suffix);
-      sprintf(kernelName, "cnsSurface%s", suffix);
+      fileName   = oklFilePrefix + "cnsSurface" + suffix + oklFileSuffix;
+      kernelName = "cnsSurface" + suffix;
 
-      cns->surfaceKernel = platform.buildKernel(fileName, kernelName,
+      surfaceKernel = platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
     }
   }
 
   // kernels from volume file
-  sprintf(fileName, DCNS "/okl/cnsGradVolume%s.okl", suffix);
-  sprintf(kernelName, "cnsGradVolume%s", suffix);
+  fileName   = oklFilePrefix + "cnsGradVolume" + suffix + oklFileSuffix;
+  kernelName = "cnsGradVolume" + suffix;
 
-  cns->gradVolumeKernel =  platform.buildKernel(fileName, kernelName,
+  gradVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   // kernels from surface file
-  sprintf(fileName, DCNS "/okl/cnsGradSurface%s.okl", suffix);
-  sprintf(kernelName, "cnsGradSurface%s", suffix);
+  fileName   = oklFilePrefix + "cnsGradSurface" + suffix + oklFileSuffix;
+  kernelName = "cnsGradSurface" + suffix;
 
-  cns->gradSurfaceKernel = platform.buildKernel(fileName, kernelName,
+  gradSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
 
   // vorticity calculation
-  sprintf(fileName, DCNS "/okl/cnsVorticity%s.okl", suffix);
-  sprintf(kernelName, "cnsVorticity%s", suffix);
+  fileName   = oklFilePrefix + "cnsVorticity" + suffix + oklFileSuffix;
+  kernelName = "cnsVorticity" + suffix;
 
-  cns->vorticityKernel = platform.buildKernel(fileName, kernelName,
+  vorticityKernel = platform.buildKernel(fileName, kernelName,
                                      kernelInfo);
 
   if (mesh.dim==2) {
-    sprintf(fileName, DCNS "/okl/cnsInitialCondition2D.okl");
-    if (cns->isothermal)
-      sprintf(kernelName, "cnsIsothermalInitialCondition2D");
+    fileName   = oklFilePrefix + "cnsInitialCondition2D" + oklFileSuffix;
+    if (isothermal)
+      kernelName = "cnsIsothermalInitialCondition2D";
     else
-      sprintf(kernelName, "cnsInitialCondition2D");
+      kernelName = "cnsInitialCondition2D";
   } else {
-    sprintf(fileName, DCNS "/okl/cnsInitialCondition3D.okl");
-    if (cns->isothermal)
-      sprintf(kernelName, "cnsIsothermalInitialCondition3D");
+    fileName   = oklFilePrefix + "cnsInitialCondition3D" + oklFileSuffix;
+    if (isothermal)
+      kernelName = "cnsIsothermalInitialCondition3D";
     else
-      sprintf(kernelName, "cnsInitialCondition3D");
+      kernelName = "cnsInitialCondition3D";
   }
 
-
-  cns->initialConditionKernel = platform.buildKernel(fileName, kernelName,
+  initialConditionKernel = platform.buildKernel(fileName, kernelName,
                                             kernelInfo);
 
-  sprintf(fileName, DCNS "/okl/cnsMaxWaveSpeed%s.okl", suffix);
-  if (cns->isothermal) {
-    sprintf(kernelName, "cnsIsothermalMaxWaveSpeed%s", suffix);
+  fileName   = oklFilePrefix + "cnsMaxWaveSpeed" + suffix + oklFileSuffix;
+  if (isothermal) {
+    kernelName = "cnsIsothermalMaxWaveSpeed" + suffix;
   } else {
-    sprintf(kernelName, "cnsMaxWaveSpeed%s", suffix);
+    kernelName = "cnsMaxWaveSpeed" + suffix;
   }
 
-  cns->maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName,
+  maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName,
                                             kernelInfo);
-
-  return *cns;
-}
-
-cns_t::~cns_t() {
-  volumeKernel.free();
-  surfaceKernel.free();
-  cubatureVolumeKernel.free();
-  cubatureSurfaceKernel.free();
-  gradVolumeKernel.free();
-  gradSurfaceKernel.free();
-  vorticityKernel.free();
-  constrainKernel.free();
-  initialConditionKernel.free();
-  maxWaveSpeedKernel.free();
-
-  if (timeStepper) delete timeStepper;
-  if (fieldTraceHalo) fieldTraceHalo->Free();
-  if (gradTraceHalo) gradTraceHalo->Free();
 }
diff --git a/solvers/cns/src/cnsStep.cpp b/solvers/cns/src/cnsStep.cpp
index c0f35a403..1544320a2 100644
--- a/solvers/cns/src/cnsStep.cpp
+++ b/solvers/cns/src/cnsStep.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,10 +26,10 @@ SOFTWARE.
 
 #include "cns.hpp"
 
-dfloat cns_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){
+dfloat cns_t::MaxWaveSpeed(deviceMemory<dfloat>& o_Q, const dfloat T){
 
   //Note: if this is on the critical path in the future, we should pre-allocate this
-  occa::memory o_maxSpeed = platform.malloc(mesh.Nelements*sizeof(dfloat));
+  deviceMemory<dfloat> o_maxSpeed = platform.malloc<dfloat>(mesh.Nelements);
 
   maxWaveSpeedKernel(mesh.Nelements,
                      mesh.o_vgeo,
@@ -45,17 +45,16 @@ dfloat cns_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){
                      o_Q,
                      o_maxSpeed);
 
-  const dfloat vmax = platform.linAlg.max(mesh.Nelements, o_maxSpeed, mesh.comm);
+  const dfloat vmax = platform.linAlg().max(mesh.Nelements, o_maxSpeed, mesh.comm);
 
-  o_maxSpeed.free();
   return vmax;
 }
 
 //evaluate ODE rhs = f(q,t)
-void cns_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void cns_t::rhsf(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){
 
   // extract q trace halo and start exchange
-  fieldTraceHalo->ExchangeStart(o_Q, 1, ogs_dfloat);
+  fieldTraceHalo.ExchangeStart(o_Q, 1);
 
   // compute volume contributions to gradients
   gradVolumeKernel(mesh.Nelements,
@@ -65,7 +64,7 @@ void cns_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
                    o_gradq);
 
   // complete trace halo exchange
-  fieldTraceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat);
+  fieldTraceHalo.ExchangeFinish(o_Q, 1);
 
   // compute surface contributions to gradients
   gradSurfaceKernel(mesh.Nelements,
@@ -84,7 +83,7 @@ void cns_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
                     o_gradq);
 
   // extract viscousStresses trace halo and start exchange
-  gradTraceHalo->ExchangeStart(o_gradq, 1, ogs_dfloat);
+  gradTraceHalo.ExchangeStart(o_gradq, 1);
 
   // compute volume contribution to cns RHS
   if (cubature) {
@@ -120,7 +119,7 @@ void cns_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
   }
 
   // complete trace halo exchange
-  gradTraceHalo->ExchangeFinish(o_gradq, 1, ogs_dfloat);
+  gradTraceHalo.ExchangeFinish(o_gradq, 1);
 
   if (cubature) {
       cubatureSurfaceKernel(mesh.Nelements,
diff --git a/solvers/elliptic/data/ellipticBoundary2D.h b/solvers/elliptic/data/ellipticBoundary2D.h
index dcd3a716e..0dcbb475d 100644
--- a/solvers/elliptic/data/ellipticBoundary2D.h
+++ b/solvers/elliptic/data/ellipticBoundary2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/elliptic/data/ellipticBoundary3D.h b/solvers/elliptic/data/ellipticBoundary3D.h
index 722794907..1d4bfcdd3 100644
--- a/solvers/elliptic/data/ellipticBoundary3D.h
+++ b/solvers/elliptic/data/ellipticBoundary3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/elliptic/data/ellipticHomogeneous2D.h b/solvers/elliptic/data/ellipticHomogeneous2D.h
index 15b4c8079..259525e9b 100644
--- a/solvers/elliptic/data/ellipticHomogeneous2D.h
+++ b/solvers/elliptic/data/ellipticHomogeneous2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/elliptic/data/ellipticHomogeneous3D.h b/solvers/elliptic/data/ellipticHomogeneous3D.h
index 36a684078..603ccc2f0 100644
--- a/solvers/elliptic/data/ellipticHomogeneous3D.h
+++ b/solvers/elliptic/data/ellipticHomogeneous3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/elliptic/data/ellipticSine2D.h b/solvers/elliptic/data/ellipticSine2D.h
index a170b990a..820575d65 100644
--- a/solvers/elliptic/data/ellipticSine2D.h
+++ b/solvers/elliptic/data/ellipticSine2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/elliptic/data/ellipticSine3D.h b/solvers/elliptic/data/ellipticSine3D.h
index 574c786de..342c627a5 100644
--- a/solvers/elliptic/data/ellipticSine3D.h
+++ b/solvers/elliptic/data/ellipticSine3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/elliptic/elliptic.hpp b/solvers/elliptic/elliptic.hpp
index 60533aa57..96ba5b972 100644
--- a/solvers/elliptic/elliptic.hpp
+++ b/solvers/elliptic/elliptic.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -38,22 +38,24 @@ SOFTWARE.
 
 #define DELLIPTIC LIBP_DIR"/solvers/elliptic/"
 
+using namespace libp;
+
 class ellipticSettings_t: public settings_t {
 public:
-  ellipticSettings_t(const MPI_Comm& _comm);
+  ellipticSettings_t() = default;
+  ellipticSettings_t(const comm_t& _comm);
   void report();
   void parseFromFile(platformSettings_t& platformSettings,
                      meshSettings_t& meshSettings,
-                     const string filename);
+                     const std::string filename);
 };
 void ellipticAddRunSettings(settings_t& settings);
 void ellipticAddSettings(settings_t& settings,
-                         const string prefix="");
+                         const std::string prefix="");
 
 class elliptic_t: public solver_t {
 public:
-  mesh_t &mesh;
-  linAlg_t &linAlg;
+  mesh_t mesh;
 
   dlong Ndofs, Nhalo;
   int Nfields;
@@ -63,66 +65,69 @@ class elliptic_t: public solver_t {
 
   int disc_ipdg, disc_c0;
 
-  occa::memory o_AqL;
+  deviceMemory<dfloat> o_AqL;
 
-  halo_t* traceHalo;
+  ogs::halo_t traceHalo;
 
-  precon_t* precon;
+  precon_t precon;
 
-  dfloat *grad;
-  occa::memory o_grad;
+  memory<dfloat> grad;
+  deviceMemory<dfloat> o_grad;
 
-  dfloat *weight, *weightG;
-  occa::memory o_weight, o_weightG;
+  memory<dfloat> weight, weightG;
+  deviceMemory<dfloat> o_weight, o_weightG;
 
   //C0-FEM mask data
-  ogs_t *ogsMasked;
-  int *mapB;      // boundary flag of face nodes
+  ogs::ogs_t ogsMasked;
+  ogs::halo_t gHalo;
+  memory<int> mapB;      // boundary flag of face nodes
+  deviceMemory<int> o_mapB;
 
   dlong Nmasked;
-  dlong *maskIds;
-  hlong *maskedGlobalIds;
-  hlong *maskedGlobalNumbering;
+  memory<dlong> maskIds;
+  memory<hlong> maskedGlobalIds;
+  memory<hlong> maskedGlobalNumbering;
+  memory<dlong> GlobalToLocal;
 
-  occa::memory o_maskIds;
-  occa::memory o_mapB;
+  deviceMemory<dlong> o_maskIds;
+  deviceMemory<dlong> o_GlobalToLocal;
 
-  int *BCType;
-  int *EToB;
-  occa::memory o_EToB;
+  int NBCTypes;
+  memory<int> BCType;
+  memory<int> EToB;
+  deviceMemory<int> o_EToB;
 
   int allNeumann;
   dfloat allNeumannPenalty;
   dfloat allNeumannScale;
 
-  occa::kernel maskKernel;
-  occa::kernel partialAxKernel;
-  occa::kernel partialGradientKernel;
-  occa::kernel partialIpdgKernel;
+  kernel_t maskKernel;
+  kernel_t partialAxKernel;
+  kernel_t partialGradientKernel;
+  kernel_t partialIpdgKernel;
 
-  elliptic_t() = delete;
+  elliptic_t() = default;
   elliptic_t(platform_t &_platform, mesh_t &_mesh,
-              settings_t& _settings, dfloat _lambda):
-    solver_t(_platform, _settings), mesh(_mesh),
-    linAlg(_platform.linAlg), lambda(_lambda) {}
-
-  ~elliptic_t();
+              settings_t& _settings, dfloat _lambda,
+              const int _NBCTypes, const memory<int> _BCType) {
+    Setup(_platform, _mesh, _settings, _lambda, _NBCTypes, _BCType);
+  }
 
   //setup
-  static elliptic_t& Setup(platform_t& platform, mesh_t& mesh,
-                           ellipticSettings_t& settings, dfloat lambda,
-                           const int NBCTypes, const int *BCType);
+  void Setup(platform_t& _platform, mesh_t& _mesh,
+             settings_t& _settings, dfloat _lambda,
+             const int _NBCTypes, const memory<int> _BCType);
 
   void BoundarySetup();
 
   void Run();
 
-  int Solve(linearSolver_t& linearSolver, occa::memory &o_x, occa::memory &o_r,
+  int Solve(linearSolver_t& linearSolver, deviceMemory<dfloat> &o_x, deviceMemory<dfloat> &o_r,
             const dfloat tol, const int MAXIT, const int verbose);
 
-  void PlotFields(dfloat* Q, char *fileName);
+  void PlotFields(memory<dfloat>& Q, std::string fileName);
 
-  void Operator(occa::memory& o_q, occa::memory& o_Aq);
+  void Operator(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_Aq);
 
   void BuildOperatorMatrixIpdg(parAlmond::parCOO& A);
   void BuildOperatorMatrixContinuous(parAlmond::parCOO& A);
@@ -141,27 +146,27 @@ class elliptic_t: public solver_t {
   void BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A);
   void BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A);
 
-  void BuildOperatorDiagonal(dfloat *diagA);
+  void BuildOperatorDiagonal(memory<dfloat>& diagA);
 
-  void BuildOperatorDiagonalContinuousTri2D(dfloat *diagA);
-  void BuildOperatorDiagonalContinuousTri3D(dfloat *diagA);
-  void BuildOperatorDiagonalContinuousQuad2D(dfloat *diagA);
-  void BuildOperatorDiagonalContinuousQuad3D(dfloat *diagA);
-  void BuildOperatorDiagonalContinuousTet3D(dfloat *diagA);
-  void BuildOperatorDiagonalContinuousHex3D(dfloat *diagA);
+  void BuildOperatorDiagonalContinuousTri2D(memory<dfloat>& diagA);
+  void BuildOperatorDiagonalContinuousTri3D(memory<dfloat>& diagA);
+  void BuildOperatorDiagonalContinuousQuad2D(memory<dfloat>& diagA);
+  void BuildOperatorDiagonalContinuousQuad3D(memory<dfloat>& diagA);
+  void BuildOperatorDiagonalContinuousTet3D(memory<dfloat>& diagA);
+  void BuildOperatorDiagonalContinuousHex3D(memory<dfloat>& diagA);
 
-  void BuildOperatorDiagonalIpdgTri2D(dfloat *diagA);
-  void BuildOperatorDiagonalIpdgTri3D(dfloat *diagA);
-  void BuildOperatorDiagonalIpdgQuad2D(dfloat *diagA);
-  void BuildOperatorDiagonalIpdgQuad3D(dfloat *diagA);
-  void BuildOperatorDiagonalIpdgTet3D(dfloat *diagA);
-  void BuildOperatorDiagonalIpdgHex3D(dfloat *diagA);
+  void BuildOperatorDiagonalIpdgTri2D(memory<dfloat>& diagA);
+  void BuildOperatorDiagonalIpdgTri3D(memory<dfloat>& diagA);
+  void BuildOperatorDiagonalIpdgQuad2D(memory<dfloat>& diagA);
+  void BuildOperatorDiagonalIpdgQuad3D(memory<dfloat>& diagA);
+  void BuildOperatorDiagonalIpdgTet3D(memory<dfloat>& diagA);
+  void BuildOperatorDiagonalIpdgHex3D(memory<dfloat>& diagA);
 
-  elliptic_t& SetupNewDegree(mesh_t& meshF);
+  elliptic_t SetupNewDegree(mesh_t& meshF);
 
-  elliptic_t* SetupRingPatch(mesh_t& meshPatch);
+  elliptic_t SetupRingPatch(mesh_t& meshPatch);
 
-  void ZeroMean(occa::memory &o_q);
+  void ZeroMean(deviceMemory<dfloat> &o_q);
 };
 
 
diff --git a/solvers/elliptic/ellipticMain.cpp b/solvers/elliptic/ellipticMain.cpp
index 5aa74d071..81977f4a8 100644
--- a/solvers/elliptic/ellipticMain.cpp
+++ b/solvers/elliptic/ellipticMain.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,48 +29,52 @@ SOFTWARE.
 int main(int argc, char **argv){
 
   // start up MPI
-  MPI_Init(&argc, &argv);
+  Comm::Init(argc, argv);
 
-  MPI_Comm comm = MPI_COMM_WORLD;
+  LIBP_ABORT("Usage: ./ellipticMain setupfile", argc!=2);
 
-  if(argc!=2)
-    LIBP_ABORT(string("Usage: ./ellipticMain setupfile"));
+  { /*Scope so everything is destructed before MPI_Finalize */
+    comm_t comm(Comm::World().Dup());
 
-  //create default settings
-  platformSettings_t platformSettings(comm);
-  meshSettings_t meshSettings(comm);
-  ellipticSettings_t ellipticSettings(comm);
-  ellipticAddRunSettings(ellipticSettings);
+    //create default settings
+    platformSettings_t platformSettings(comm);
+    meshSettings_t meshSettings(comm);
+    ellipticSettings_t ellipticSettings(comm);
+    ellipticAddRunSettings(ellipticSettings);
 
-  //load settings from file
-  ellipticSettings.parseFromFile(platformSettings, meshSettings,
-                                 argv[1]);
+    //load settings from file
+    ellipticSettings.parseFromFile(platformSettings, meshSettings,
+                                   argv[1]);
 
-  // set up platform
-  platform_t platform(platformSettings);
+    // set up platform
+    platform_t platform(platformSettings);
 
-  platformSettings.report();
-  meshSettings.report();
-  ellipticSettings.report();
+    platformSettings.report();
+    meshSettings.report();
+    ellipticSettings.report();
 
-  // set up mesh
-  mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm);
+    // set up mesh
+    mesh_t mesh(platform, meshSettings, comm);
 
-  dfloat lambda = 0.0;
-  ellipticSettings.getSetting("LAMBDA", lambda);
+    dfloat lambda = 0.0;
+    ellipticSettings.getSetting("LAMBDA", lambda);
 
-  // Boundary Type translation. Just defaults.
-  int NBCTypes = 3;
-  int BCType[NBCTypes] = {0,1,2};
+    // Boundary Type translation. Just defaults.
+    int NBCTypes = 3;
+    memory<int> BCType(3);
+    BCType[0] = 0;
+    BCType[1] = 1;
+    BCType[2] = 2;
 
-  // set up elliptic solver
-  elliptic_t& elliptic = elliptic_t::Setup(platform, mesh, ellipticSettings,
-                                           lambda, NBCTypes, BCType);
+    // set up elliptic solver
+    elliptic_t elliptic(platform, mesh, ellipticSettings,
+                        lambda, NBCTypes, BCType);
 
-  // run
-  elliptic.Run();
+    // run
+    elliptic.Run();
+  }
 
   // close down MPI
-  MPI_Finalize();
+  Comm::Finalize();
   return LIBP_SUCCESS;
 }
diff --git a/solvers/elliptic/ellipticPrecon.hpp b/solvers/elliptic/ellipticPrecon.hpp
index 0e735bd4b..65b1ae284 100644
--- a/solvers/elliptic/ellipticPrecon.hpp
+++ b/solvers/elliptic/ellipticPrecon.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -31,151 +31,112 @@ SOFTWARE.
 #include "parAlmond.hpp"
 
 //Jacobi preconditioner
-class JacobiPrecon: public precon_t {
+class JacobiPrecon: public operator_t {
 private:
-	elliptic_t& elliptic;
+	elliptic_t elliptic;
 
-  occa::memory o_invDiagA;
+  deviceMemory<dfloat> o_invDiagA;
 
 public:
+  JacobiPrecon() = default;
   JacobiPrecon(elliptic_t& elliptic);
-  void Operator(occa::memory& o_r, occa::memory& o_Mr);
+  void Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr);
 };
 
 //Inverse Mass Matrix preconditioner
-class MassMatrixPrecon: public precon_t {
+class MassMatrixPrecon: public operator_t {
 private:
-  elliptic_t& elliptic;
-  mesh_t& mesh;
-  settings_t& settings;
+  elliptic_t elliptic;
+  mesh_t mesh;
+  settings_t settings;
 
-  occa::memory o_MrL, o_rtmp;
-  occa::memory o_invMM;
+  deviceMemory<dfloat> o_MrL, o_rtmp;
+  deviceMemory<dfloat> o_invMM;
 
-  occa::kernel blockJacobiKernel;
-  occa::kernel partialBlockJacobiKernel;
+  kernel_t blockJacobiKernel;
+  kernel_t partialBlockJacobiKernel;
 
 public:
-  ~MassMatrixPrecon();
+  MassMatrixPrecon() = default;
   MassMatrixPrecon(elliptic_t& elliptic);
-  void Operator(occa::memory& o_r, occa::memory& o_Mr);
+  void Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr);
 };
 
 //ParAlmond AMG preconditioner
-class ParAlmondPrecon: public precon_t {
+class ParAlmondPrecon: public operator_t {
 private:
-  elliptic_t& elliptic;
-  settings_t& settings;
+  elliptic_t elliptic;
+  settings_t settings;
 
   parAlmond::parAlmond_t parAlmond;
 
-  dfloat *xG, *rhsG;
-  occa::memory o_xG, o_rhsG;
+  memory<dfloat> xG, rhsG;
+  deviceMemory<dfloat> o_xG, o_rhsG;
 
 public:
-  ~ParAlmondPrecon();
+  ParAlmondPrecon() = default;
   ParAlmondPrecon(elliptic_t& elliptic);
-  void Operator(occa::memory& o_r, occa::memory& o_Mr);
+  void Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr);
 };
 
 // Matrix-free p-Multigrid levels followed by AMG
-class MultiGridPrecon: public precon_t {
+class MultiGridPrecon: public operator_t {
 private:
-  elliptic_t& elliptic;
-  mesh_t& mesh;
-  settings_t& settings;
+  elliptic_t elliptic;
+  mesh_t mesh;
+  settings_t settings;
 
   parAlmond::parAlmond_t parAlmond;
 
 public:
+  MultiGridPrecon() = default;
   MultiGridPrecon(elliptic_t& elliptic);
-  ~MultiGridPrecon() = default;
-  void Operator(occa::memory& o_r, occa::memory& o_Mr);
+  void Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr);
 };
 
 // Cast problem into spectrally-equivalent N=1 FEM space and precondition with AMG
-class SEMFEMPrecon: public precon_t {
+class SEMFEMPrecon: public operator_t {
 private:
-  elliptic_t& elliptic;
-  mesh_t& mesh;
-  settings_t& settings;
+  elliptic_t elliptic;
+  mesh_t mesh;
+  settings_t settings;
 
-  mesh_t *femMesh;
-  elliptic_t* femElliptic;
+  mesh_t femMesh;
+  elliptic_t femElliptic;
   parAlmond::parAlmond_t parAlmond;
 
-  occa::memory o_MrL;
+  deviceMemory<dfloat> o_MrL;
 
-  occa::memory o_zFEM, o_rFEM;
-  occa::memory o_GzFEM, o_GrFEM;
+  deviceMemory<dfloat> o_zFEM, o_rFEM;
+  deviceMemory<dfloat> o_GzFEM, o_GrFEM;
 
-  ogs_t *FEMogs;
+  ogs::ogs_t FEMogs;
 
-  occa::kernel SEMFEMInterpKernel;
-  occa::kernel SEMFEMAnterpKernel;
+  kernel_t SEMFEMInterpKernel;
+  kernel_t SEMFEMAnterpKernel;
 
 public:
-  ~SEMFEMPrecon();
+  SEMFEMPrecon() = default;
   SEMFEMPrecon(elliptic_t& elliptic);
-  void Operator(occa::memory& o_r, occa::memory& o_Mr);
+  void Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr);
 };
 
-class MGLevel;
-// Overlapping additive Schwarz with patch problems consisting of the
-//  entire local mesh + 1 ring overlap, solved with a local multigrid
-//  precon and coarse problem consisting of the global degree 1
-//  problem, solved with parAlmond
-class OASPrecon: public precon_t {
-private:
-  elliptic_t& elliptic;
-  mesh_t& mesh;
-  settings_t& settings;
-
-  //Patch precon
-  mesh_t* meshPatch;
-  elliptic_t* ellipticPatch;
-  precon_t *preconPatch;
-  MGLevel *level;
-
-  ogs_t *ogsMaskedRing; //ogs for 1-ring patch
-
-  //Coarse Precon
-  ogs_t *ogsMasked=nullptr;
-  parAlmond::parAlmond_t parAlmond;
-
-  dfloat *rPatch, *zPatch;
-  dfloat *rPatchL, *zPatchL;
-  occa::memory o_rPatch, o_zPatch;
-  occa::memory o_rPatchL, o_zPatchL;
-
-  dfloat *rC, *zC;
-  occa::memory o_rC, o_zC;
-
-  dfloat *patchWeight;
-  occa::memory o_patchWeight;
-
-public:
-  ~OASPrecon();
-  OASPrecon(elliptic_t& elliptic);
-  void Operator(occa::memory& o_r, occa::memory& o_Mr);
-};
 
 class MGLevel: public parAlmond::multigridLevel {
 public:
-  elliptic_t& elliptic;
-  mesh_t& mesh;
-  linAlg_t& linAlg;
+  elliptic_t elliptic;
+  mesh_t mesh;
 
   //prologation
-  dfloat *P;
-  occa::memory o_P;
+  memory<dfloat> P;
+  deviceMemory<dfloat> o_P;
 
-  occa::kernel coarsenKernel, partialCoarsenKernel;
-  occa::kernel prolongateKernel, partialProlongateKernel;
+  kernel_t coarsenKernel, partialCoarsenKernel;
+  kernel_t prolongateKernel, partialProlongateKernel;
 
-  //coarse gather op
-  mesh_t *meshC=nullptr;
-  ogs_t *ogsMaskedC=nullptr;
+  //coarse space
+  elliptic_t ellipticC;
+  mesh_t meshC;
 
   //smoothing params
   typedef enum {JACOBI=1,
@@ -185,35 +146,35 @@ class MGLevel: public parAlmond::multigridLevel {
   dfloat lambda1, lambda0;
   int ChebyshevIterations;
 
-  static size_t smootherResidualBytes, scratchBytes;
-  static dfloat *smootherResidual;
-  static occa::memory o_smootherResidual;
-  static occa::memory o_smootherResidual2;
-  static occa::memory o_smootherUpdate;
-  static occa::memory o_transferScratch;
+  static dlong NsmootherResidual, Nscratch;
+  static memory<dfloat> smootherResidual;
+  static deviceMemory<dfloat> o_smootherResidual;
+  static deviceMemory<dfloat> o_smootherResidual2;
+  static deviceMemory<dfloat> o_smootherUpdate;
+  static deviceMemory<dfloat> o_transferScratch;
 
   //jacobi data
-  occa::memory o_invDiagA;
+  deviceMemory<dfloat> o_invDiagA;
 
   //build a p-multigrid level and connect it to the next one
+  MGLevel() = default;
   MGLevel(elliptic_t& _elliptic,
           dlong _Nrows, dlong _Ncols,
           int Nc, int NpCoarse);
-  ~MGLevel();
 
-  void Operator(occa::memory &o_X, occa::memory &o_Ax);
+  void Operator(deviceMemory<dfloat> &o_X, deviceMemory<dfloat> &o_Ax);
 
-  void residual(occa::memory &o_RHS, occa::memory &o_X, occa::memory &o_RES);
+  void residual(deviceMemory<dfloat> &o_RHS, deviceMemory<dfloat> &o_X, deviceMemory<dfloat> &o_RES);
 
-  void coarsen(occa::memory &o_X, occa::memory &o_Cx);
+  void coarsen(deviceMemory<dfloat> &o_X, deviceMemory<dfloat> &o_Cx);
 
-  void prolongate(occa::memory &o_X, occa::memory &o_Px);
+  void prolongate(deviceMemory<dfloat> &o_X, deviceMemory<dfloat> &o_Px);
 
   //smoother ops
-  void smooth(occa::memory &o_RHS, occa::memory &o_X, bool x_is_zero);
+  void smooth(deviceMemory<dfloat> &o_RHS, deviceMemory<dfloat> &o_X, bool x_is_zero);
 
-  void smoothJacobi    (occa::memory &o_r, occa::memory &o_X, bool xIsZero);
-  void smoothChebyshev (occa::memory &o_r, occa::memory &o_X, bool xIsZero);
+  void smoothJacobi    (deviceMemory<dfloat> &o_r, deviceMemory<dfloat> &o_X, bool xIsZero);
+  void smoothChebyshev (deviceMemory<dfloat> &o_r, deviceMemory<dfloat> &o_X, bool xIsZero);
 
   void Report();
 
@@ -223,5 +184,44 @@ class MGLevel: public parAlmond::multigridLevel {
   void AllocateStorage();
 };
 
+// Overlapping additive Schwarz with patch problems consisting of the
+//  entire local mesh + 1 ring overlap, solved with a local multigrid
+//  precon and coarse problem consisting of the global degree 1
+//  problem, solved with parAlmond
+class OASPrecon: public operator_t {
+private:
+  elliptic_t elliptic;
+  mesh_t mesh;
+  settings_t settings;
+
+  //Patch precon
+  mesh_t meshPatch;
+  elliptic_t ellipticPatch;
+  precon_t preconPatch;
+  MGLevel level;
+
+  ogs::ogs_t ogsMaskedRing; //ogs for 1-ring patch
+
+  //Coarse Precon
+  ogs::ogs_t ogsMasked;
+  parAlmond::parAlmond_t parAlmond;
+
+  memory<dfloat> rPatch, zPatch;
+  memory<dfloat> rPatchL, zPatchL;
+  deviceMemory<dfloat> o_rPatch, o_zPatch;
+  deviceMemory<dfloat> o_rPatchL, o_zPatchL;
+
+  memory<dfloat> rC, zC;
+  deviceMemory<dfloat> o_rC, o_zC;
+
+  memory<dfloat> patchWeight;
+  deviceMemory<dfloat> o_patchWeight;
+
+public:
+  OASPrecon() = default;
+  OASPrecon(elliptic_t& elliptic);
+  void Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr);
+};
+
 
-#endif
\ No newline at end of file
+#endif
diff --git a/solvers/elliptic/makefile b/solvers/elliptic/makefile
index 18183d2ee..8b9496233 100644
--- a/solvers/elliptic/makefile
+++ b/solvers/elliptic/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -78,11 +78,8 @@ include ../../make.top
 endif
 endif
 
-#gslib
-GS_DIR=${LIBP_TPL_DIR}/gslib
-
 #libraries
-ELLIPTIC_LIBP_LIBS=parAlmond linearSolver mesh ogs linAlg core
+ELLIPTIC_LIBP_LIBS=parAlmond linearSolver mesh parAdogs ogs linAlg core
 
 #includes
 INCLUDES=${LIBP_INCLUDES} \
@@ -92,11 +89,10 @@ DEFINES =${LIBP_DEFINES} \
          -DLIBP_DIR='"${LIBP_DIR}"'
 
 #.cpp compilation flags
-ELLIPTIC_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES}
+ELLIPTIC_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES}
 
 #link libraries
 LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(ELLIPTIC_LIBP_LIBS)) \
-     -L$(GS_DIR)/lib -lgs \
      ${LIBP_LIBS}
 
 #link flags
@@ -144,10 +140,10 @@ endif
 # rule for .cpp files
 %.o: %.cpp $(DEPS) | libp_libs
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $*.o -c $*.cpp $(ELLIPTIC_CXXFLAGS)
+	$(LIBP_CXX) -o $*.o -c $*.cpp $(ELLIPTIC_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $*.o -c $*.cpp $(ELLIPTIC_CXXFLAGS)
+	@$(LIBP_CXX) -o $*.o -c $*.cpp $(ELLIPTIC_CXXFLAGS)
 endif
 
 #cleanup
@@ -158,8 +154,7 @@ clean-libs: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} clean
 
 clean-kernels: clean-libs
-# 	$(shell ${OCCA_DIR}/bin/occa clear all -y)
-	rm -rf ~/.occa/
+	rm -rf ${LIBP_DIR}/.occa/
 
 realclean: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} realclean
diff --git a/solvers/elliptic/okl/ellipticAddBCHex3D.okl b/solvers/elliptic/okl/ellipticAddBCHex3D.okl
index 085e6e35b..634492c0e 100644
--- a/solvers/elliptic/okl/ellipticAddBCHex3D.okl
+++ b/solvers/elliptic/okl/ellipticAddBCHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/elliptic/okl/ellipticAddBCQuad2D.okl b/solvers/elliptic/okl/ellipticAddBCQuad2D.okl
index 5f0db75fd..17025be10 100644
--- a/solvers/elliptic/okl/ellipticAddBCQuad2D.okl
+++ b/solvers/elliptic/okl/ellipticAddBCQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -46,4 +46,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/elliptic/okl/ellipticAddBCQuad3D.okl b/solvers/elliptic/okl/ellipticAddBCQuad3D.okl
index f62e51948..8236ce41c 100644
--- a/solvers/elliptic/okl/ellipticAddBCQuad3D.okl
+++ b/solvers/elliptic/okl/ellipticAddBCQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/elliptic/okl/ellipticAddBCTet3D.okl b/solvers/elliptic/okl/ellipticAddBCTet3D.okl
index 7a29b10fa..4d93151f4 100644
--- a/solvers/elliptic/okl/ellipticAddBCTet3D.okl
+++ b/solvers/elliptic/okl/ellipticAddBCTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/elliptic/okl/ellipticAddBCTri2D.okl b/solvers/elliptic/okl/ellipticAddBCTri2D.okl
index 2f905a125..b5f8e124a 100644
--- a/solvers/elliptic/okl/ellipticAddBCTri2D.okl
+++ b/solvers/elliptic/okl/ellipticAddBCTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/elliptic/okl/ellipticAxHex3D.okl b/solvers/elliptic/okl/ellipticAxHex3D.okl
index bfb2b7ca7..7a3ea1ce0 100644
--- a/solvers/elliptic/okl/ellipticAxHex3D.okl
+++ b/solvers/elliptic/okl/ellipticAxHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@ SOFTWARE.
 
 
 @kernel void ellipticAxHex3D(const dlong Nelements,
+                             @restrict const  dfloat *  wJ,
                              @restrict const  dfloat *  ggeo,
                              @restrict const  dfloat *  DT,
                              @restrict const  dfloat *  S,
@@ -81,12 +82,10 @@ SOFTWARE.
             r_G12 = ggeo[gbase+p_G12ID*p_Np];
             r_G22 = ggeo[gbase+p_G22ID*p_Np];
 
-            r_GwJ = ggeo[gbase+p_GWJID*p_Np];
+            r_GwJ = wJ[e*p_Np + k*p_Nq*p_Nq + j*p_Nq + i];
           }
         }
 
-        @barrier("local");
-
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
 
@@ -102,8 +101,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
-
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
 
@@ -125,8 +122,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
-
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
 
@@ -163,6 +158,7 @@ SOFTWARE.
 @kernel void ellipticPartialAxHex3D_v0(const dlong Nelements,
                                     @restrict const  dlong  *  elementList,
                                     @restrict const  dlong  *  GlobalToLocal,
+                                    @restrict const  dfloat *  wJ,
                                     @restrict const  dfloat *  ggeo,
                                     @restrict const  dfloat *  DT,
                                     @restrict const  dfloat *  S,
@@ -226,11 +222,10 @@ SOFTWARE.
             r_G12 = ggeo[gbase+p_G12ID*p_Np];
             r_G22 = ggeo[gbase+p_G22ID*p_Np];
 
-            r_GwJ = ggeo[gbase+p_GWJID*p_Np];
+            r_GwJ = wJ[element*p_Np + k*p_Nq*p_Nq + j*p_Nq + i];
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -247,7 +242,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -270,7 +264,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -373,7 +366,6 @@ SOFTWARE.
             }
           }
 
-          @barrier("local");
 
           for(int j=0;j<p_Nq;++j;@inner(1)){
             for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -390,7 +382,6 @@ SOFTWARE.
             }
           }
 
-          @barrier("local");
 
           for(int j=0;j<p_Nq;++j;@inner(1)){
             for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -416,7 +407,6 @@ SOFTWARE.
             }
           }
 
-          @barrier("local");
 
           for(int j=0;j<p_Nq;++j;@inner(1)){
             for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -516,7 +506,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // Layer by layer
 //    #pragma unroll p_Nq
@@ -568,7 +557,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -585,7 +573,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -608,7 +595,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -703,7 +689,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -764,7 +749,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -781,7 +765,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -804,7 +787,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -915,7 +897,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -932,7 +913,6 @@ SOFTWARE.
 //    #pragma unroll p_Nq
       for(int k = 0;k < p_Nq; k++){
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -953,7 +933,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -1021,7 +1000,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -1203,7 +1181,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -1271,7 +1248,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int k=0;k<p_Nq;++k;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -1339,7 +1315,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int k=0;k<p_Nq;++k;@inner(1)){
         for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -1423,7 +1398,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -1491,7 +1465,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int k=0;k<p_Nq;++k;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -1564,7 +1537,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
diff --git a/solvers/elliptic/okl/ellipticAxIpdgHex3D.okl b/solvers/elliptic/okl/ellipticAxIpdgHex3D.okl
index aa77f2621..8bd04f4b1 100644
--- a/solvers/elliptic/okl/ellipticAxIpdgHex3D.okl
+++ b/solvers/elliptic/okl/ellipticAxIpdgHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -168,7 +168,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -189,7 +188,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -208,7 +206,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -230,7 +227,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -256,7 +252,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -277,7 +272,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -303,7 +297,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     //layer by layer
     #pragma unroll p_Nq
@@ -335,7 +328,6 @@ void surfaceTerms(const int emap,
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -352,7 +344,6 @@ void surfaceTerms(const int emap,
         }
       }
 
-      @barrier("local");
     }
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -455,7 +446,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -476,7 +466,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -495,7 +484,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -520,7 +508,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -546,7 +533,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -567,7 +553,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -593,7 +578,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     //layer by layer
     #pragma unroll p_Nq
@@ -625,7 +609,6 @@ void surfaceTerms(const int emap,
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -642,7 +625,6 @@ void surfaceTerms(const int emap,
         }
       }
 
-      @barrier("local");
     }
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
diff --git a/solvers/elliptic/okl/ellipticAxIpdgQuad2D.okl b/solvers/elliptic/okl/ellipticAxIpdgQuad2D.okl
index 258257a55..11a142654 100644
--- a/solvers/elliptic/okl/ellipticAxIpdgQuad2D.okl
+++ b/solvers/elliptic/okl/ellipticAxIpdgQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -153,7 +153,6 @@ void surfaceTerms(const int element,
     }
 
 #if 1
-    @barrier("local");
 
     // loop over faces to add pseudo-gradient
 
@@ -170,7 +169,6 @@ void surfaceTerms(const int element,
 
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -184,7 +182,6 @@ void surfaceTerms(const int element,
       surfaceTerms(e, sk3, 3, 0, j, tau, sgeo, vmapM, vmapP, EToB, gradq, s_dqdx, s_dqdy, s_rhsq);
     }
 #endif
-    @barrier("local");
 
     // prescale by geofacs
     for(int j=0;j<p_Nq;++j){
@@ -206,7 +203,6 @@ void surfaceTerms(const int element,
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
@@ -270,7 +266,6 @@ void surfaceTerms(const int element,
       }
     }
 
-    @barrier("local");
 
     // loop over faces to add pseudo-gradient
 
@@ -286,7 +281,6 @@ void surfaceTerms(const int element,
       surfaceTerms(element, sk2, 2, i, p_Nq-1, tau, sgeo, vmapM, vmapP, EToB, gradq, s_dqdx, s_dqdy, s_rhsq);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -300,7 +294,6 @@ void surfaceTerms(const int element,
       surfaceTerms(element, sk3, 3, 0, j, tau, sgeo, vmapM, vmapP, EToB, gradq, s_dqdx, s_dqdy, s_rhsq);
     }
 
-    @barrier("local");
 
     // prescale by geofacs
     for(int j=0;j<p_Nq;++j){
@@ -322,7 +315,6 @@ void surfaceTerms(const int element,
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
diff --git a/solvers/elliptic/okl/ellipticAxIpdgQuad3D.okl b/solvers/elliptic/okl/ellipticAxIpdgQuad3D.okl
index f53f95028..2dabe00c8 100644
--- a/solvers/elliptic/okl/ellipticAxIpdgQuad3D.okl
+++ b/solvers/elliptic/okl/ellipticAxIpdgQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -150,7 +150,6 @@ void surfaceTerms(const int element,
       }
     }
 
-    @barrier("local");
 
     // loop over faces to add pseudo-gradient
 
@@ -167,7 +166,6 @@ void surfaceTerms(const int element,
 
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -181,7 +179,6 @@ void surfaceTerms(const int element,
       surfaceTerms(e, sk3, 3, 0, j, tau, sgeo, vmapM, vmapP, gradq, s_dqdx, s_dqdy, s_dqdz, s_rhsq);
     }
 
-    @barrier("local");
 
     // prescale by geofacs
     for(int j=0;j<p_Nq;++j){
@@ -216,7 +213,6 @@ void surfaceTerms(const int element,
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
@@ -282,7 +278,6 @@ void surfaceTerms(const int element,
       }
     }
 
-    @barrier("local");
 
     // loop over faces to add pseudo-gradient
 
@@ -298,7 +293,6 @@ void surfaceTerms(const int element,
       surfaceTerms(element, sk2, 2, i, p_Nq-1, tau, sgeo, vmapM, vmapP,gradq, s_dqdx, s_dqdy, s_dqdz, s_rhsq);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -312,7 +306,6 @@ void surfaceTerms(const int element,
       surfaceTerms(element, sk3, 3, 0, j, tau, sgeo, vmapM, vmapP, gradq, s_dqdx, s_dqdy, s_dqdz, s_rhsq);
     }
 
-    @barrier("local");
 
     // prescale by geofacs
     for(int j=0;j<p_Nq;++j){
@@ -347,7 +340,6 @@ void surfaceTerms(const int element,
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
diff --git a/solvers/elliptic/okl/ellipticAxIpdgTet3D.okl b/solvers/elliptic/okl/ellipticAxIpdgTet3D.okl
index c94154b40..75d13941b 100644
--- a/solvers/elliptic/okl/ellipticAxIpdgTet3D.okl
+++ b/solvers/elliptic/okl/ellipticAxIpdgTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -114,7 +114,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dqdx += LIFT*(sJ/J)*nx*dq
     for(int n=0;n<p_Nmax;++n;@inner(0)){
@@ -156,7 +155,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -179,7 +177,6 @@ SOFTWARE.
 
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
@@ -195,7 +192,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
@@ -300,7 +296,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dqdx += LIFT*(sJ/J)*nx*dq
     for(int n=0;n<p_Nmax;++n;@inner(0)){
@@ -342,7 +337,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -365,7 +359,6 @@ SOFTWARE.
 
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
@@ -381,7 +374,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
diff --git a/solvers/elliptic/okl/ellipticAxIpdgTri2D.okl b/solvers/elliptic/okl/ellipticAxIpdgTri2D.okl
index 242bbd997..a750d67cf 100644
--- a/solvers/elliptic/okl/ellipticAxIpdgTri2D.okl
+++ b/solvers/elliptic/okl/ellipticAxIpdgTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -103,7 +103,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dqdx += LIFT*(sJ/J)*nx*dq
     for(int n=0;n<p_Nmax;++n;@inner(0)){
@@ -134,7 +133,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -156,7 +154,6 @@ SOFTWARE.
 
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
@@ -172,7 +169,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
 
@@ -349,7 +345,6 @@ SOFTWARE.
     }
   }
 
-    @barrier("local");
 
   for(int es=0;es<p_NbV;++es;@inner(1)){//
     for(int n=0;n<p_Nmax;++n;@inner(0)){
@@ -414,7 +409,6 @@ SOFTWARE.
     }
 
 
-    @barrier("local");
 
   for(int es=0;es<p_NbV;++es;@inner(1)){//
     for(int n=0;n<p_Nmax;++n;@inner(0)){
@@ -458,7 +452,6 @@ SOFTWARE.
     }
 
 
-    @barrier("local");
 
     for(int es=0;es<p_NbV;++es;@inner(1)){//
       for(int n=0;n<p_Nmax;++n;@inner(0)){
@@ -488,7 +481,6 @@ SOFTWARE.
     }
 
 
-    @barrier("local");
 
   for(int es=0;es<p_NbV;++es;@inner(1)){//
     for(int n=0;n<p_Nmax;++n;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticAxIpdgTri3D.okl b/solvers/elliptic/okl/ellipticAxIpdgTri3D.okl
index 27299d538..87216e04a 100644
--- a/solvers/elliptic/okl/ellipticAxIpdgTri3D.okl
+++ b/solvers/elliptic/okl/ellipticAxIpdgTri3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -29,18 +29,18 @@
 // nx,ny,nz,sJ,invJ - need WsJ
 
 @kernel void ellipticAxIpdgTri3D(const dlong Nelements,
-				 @restrict const  dlong *  vmapM,
-				 @restrict const  dlong *  vmapP,
-				 const dfloat lambda,
-				 const dfloat tau,
-				 @restrict const  dfloat *  vgeo,
-				 @restrict const  dfloat *  sgeo,
-				 @restrict const  int   *  EToB,
-				 @restrict const  dfloat *  Dmatrices,
-				 @restrict const  dfloat *  LIFTT,
-				 @restrict const  dfloat *  MM,
-				 @restrict const  dfloat4 *  gradq,
-				 @restrict dfloat  *  Aq){
+                                 @restrict const  dlong *  vmapM,
+                                 @restrict const  dlong *  vmapP,
+                                 const dfloat lambda,
+                                 const dfloat tau,
+                                 @restrict const  dfloat *  vgeo,
+                                 @restrict const  dfloat *  sgeo,
+                                 @restrict const  int   *  EToB,
+                                 @restrict const  dfloat *  Dmatrices,
+                                 @restrict const  dfloat *  LIFTT,
+                                 @restrict const  dfloat *  MM,
+                                 @restrict const  dfloat4 *  gradq,
+                                 @restrict dfloat  *  Aq){
 
   for(dlong e=0;e<Nelements;++e;@outer(0)){
     @shared  dfloat s_dqdx[p_Np];
@@ -60,139 +60,135 @@
     for(int n=0;n<p_Nmax;++n;@inner(0)){
 
       if(n<p_Np){
-	// assume that this stores (qx, qy, qz, q) as dfloat4
-	const dfloat4 gradqn = gradq[e*p_Np+n];
+        // assume that this stores (qx, qy, qz, q) as dfloat4
+        const dfloat4 gradqn = gradq[e*p_Np+n];
 
-	s_dqdx[n] = gradqn.x;
-	s_dqdy[n] = gradqn.y;
-	s_dqdz[n] = gradqn.z;
-	s_lapq[n] = lambda*gradqn.w;
+        s_dqdx[n] = gradqn.x;
+        s_dqdy[n] = gradqn.y;
+        s_dqdz[n] = gradqn.z;
+        s_lapq[n] = lambda*gradqn.w;
       }
 
       if(n<p_NfacesNfp){
-	const dlong id  = n + e*p_Nfaces*p_Nfp;
-	idM = vmapM[id];
-	const dlong idP = vmapP[id];
-	// find face that owns this node
-	const int face = n/p_Nfp;
-
-	dfloat4 gradqM = gradq[idM];// could fetch from @shared after barrier
-	dfloat4 gradqP = gradq[idP];
-
-	// load surface geofactors for this face
-	dlong sid = p_Nsgeo*(e*p_Nfaces+face);
-	nx   = sgeo[sid+p_NXID];
-	ny   = sgeo[sid+p_NYID];
-	nz   = sgeo[sid+p_NZID];
-	sJ   = sgeo[sid+p_SJID];
-	invJ = sgeo[sid+p_IJID];
-	hinv = sgeo[sid+p_IHID];
-
-	const dfloat dq = gradqP.w - gradqM.w;
-	const dfloat hlf = 0.5f;
-
-	s_nxdq[n] = hlf*sJ*invJ*nx*dq;
-	s_nydq[n] = hlf*sJ*invJ*ny*dq;
-	s_nzdq[n] = hlf*sJ*invJ*nz*dq;
-
-	s_lapflux[n] = hlf*sJ*invJ*(-nx*(gradqP.x-gradqM.x)
-				    -ny*(gradqP.y-gradqM.y)
-				    -nz*(gradqP.z-gradqM.z)
-				    -tau*hinv*dq);
+        const dlong id  = n + e*p_Nfaces*p_Nfp;
+        idM = vmapM[id];
+        const dlong idP = vmapP[id];
+        // find face that owns this node
+        const int face = n/p_Nfp;
+
+        dfloat4 gradqM = gradq[idM];// could fetch from @shared after barrier
+        dfloat4 gradqP = gradq[idP];
+
+        // load surface geofactors for this face
+        dlong sid = p_Nsgeo*(e*p_Nfaces+face);
+        nx   = sgeo[sid+p_NXID];
+        ny   = sgeo[sid+p_NYID];
+        nz   = sgeo[sid+p_NZID];
+        sJ   = sgeo[sid+p_SJID];
+        invJ = sgeo[sid+p_IJID];
+        hinv = sgeo[sid+p_IHID];
+
+        const dfloat dq = gradqP.w - gradqM.w;
+        const dfloat hlf = 0.5f;
+
+        s_nxdq[n] = hlf*sJ*invJ*nx*dq;
+        s_nydq[n] = hlf*sJ*invJ*ny*dq;
+        s_nzdq[n] = hlf*sJ*invJ*nz*dq;
+
+        s_lapflux[n] = hlf*sJ*invJ*(-nx*(gradqP.x-gradqM.x)
+                                    -ny*(gradqP.y-gradqM.y)
+                                    -nz*(gradqP.z-gradqM.z)
+                                    -tau*hinv*dq);
       }
     }
 
-    @barrier("local");
 
     // dqdx += LIFT*(sJ/J)*nx*dq
     for(int n=0;n<p_Nmax;++n;@inner(0)){
 
       if(n<p_Np){
-	const dlong gid = e*p_Nvgeo;
-	const dfloat drdx = vgeo[gid + p_RXID];
-	const dfloat drdy = vgeo[gid + p_RYID];
-	const dfloat drdz = vgeo[gid + p_RZID];
-	const dfloat dsdx = vgeo[gid + p_SXID];
-	const dfloat dsdy = vgeo[gid + p_SYID];
-	const dfloat dsdz = vgeo[gid + p_SZID];
-
-	dfloat Lnxdq = 0;
-	dfloat Lnydq = 0;
-	dfloat Lnzdq = 0;
+        const dlong gid = e*p_Nvgeo;
+        const dfloat drdx = vgeo[gid + p_RXID];
+        const dfloat drdy = vgeo[gid + p_RYID];
+        const dfloat drdz = vgeo[gid + p_RZID];
+        const dfloat dsdx = vgeo[gid + p_SXID];
+        const dfloat dsdy = vgeo[gid + p_SYID];
+        const dfloat dsdz = vgeo[gid + p_SZID];
+
+        dfloat Lnxdq = 0;
+        dfloat Lnydq = 0;
+        dfloat Lnzdq = 0;
 
 #pragma unroll p_NfacesNfp
-	for(int i=0;i<p_NfacesNfp;++i){
-	  Lnxdq += LIFTT[n+i*p_Np]*s_nxdq[i];
-	  Lnydq += LIFTT[n+i*p_Np]*s_nydq[i];
-	  Lnzdq += LIFTT[n+i*p_Np]*s_nzdq[i];
-	}
-
-	dfloat dqdx = s_dqdx[n] + Lnxdq;
-	dfloat dqdy = s_dqdy[n] + Lnydq;
-	dfloat dqdz = s_dqdz[n] + Lnzdq;
-	s_dqdx[n] = drdx*dqdx + drdy*dqdy + drdz*dqdz; // abuse of notation
-	s_dqdy[n] = dsdx*dqdx + dsdy*dqdy + dsdz*dqdz;
-
-	s_Lnxdq[n] = Lnxdq;
-	s_Lnydq[n] = Lnydq;
-	s_Lnzdq[n] = Lnzdq;
+        for(int i=0;i<p_NfacesNfp;++i){
+          Lnxdq += LIFTT[n+i*p_Np]*s_nxdq[i];
+          Lnydq += LIFTT[n+i*p_Np]*s_nydq[i];
+          Lnzdq += LIFTT[n+i*p_Np]*s_nzdq[i];
+        }
+
+        dfloat dqdx = s_dqdx[n] + Lnxdq;
+        dfloat dqdy = s_dqdy[n] + Lnydq;
+        dfloat dqdz = s_dqdz[n] + Lnzdq;
+        s_dqdx[n] = drdx*dqdx + drdy*dqdy + drdz*dqdz; // abuse of notation
+        s_dqdy[n] = dsdx*dqdx + dsdy*dqdy + dsdz*dqdz;
+
+        s_Lnxdq[n] = Lnxdq;
+        s_Lnydq[n] = Lnydq;
+        s_Lnzdq[n] = Lnzdq;
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_NfacesNfp){
-	int id = idM%p_Np;
-	s_lapflux[n] += sJ*invJ*(nx*s_Lnxdq[id]+ny*s_Lnydq[id]+nz*s_Lnzdq[id]);
+        int id = idM%p_Np;
+        s_lapflux[n] += sJ*invJ*(nx*s_Lnxdq[id]+ny*s_Lnydq[id]+nz*s_Lnzdq[id]);
       }
 
       if(n<p_Np){
-	dfloat lapr = 0, laps = 0;
+        dfloat lapr = 0, laps = 0;
 
 #pragma unroll p_Np
-	for(int i=0;i<p_Np;++i){
-	  lapr += Dmatrices[n+i*p_Np+0*p_Np*p_Np]*s_dqdx[i];
-	  laps += Dmatrices[n+i*p_Np+1*p_Np*p_Np]*s_dqdy[i];
-	}
+        for(int i=0;i<p_Np;++i){
+          lapr += Dmatrices[n+i*p_Np+0*p_Np*p_Np]*s_dqdx[i];
+          laps += Dmatrices[n+i*p_Np+1*p_Np*p_Np]*s_dqdy[i];
+        }
 
-	s_lapq[n] -= (lapr+laps);
+        s_lapq[n] -= (lapr+laps);
       }
 
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
-	dfloat lap = 0;
+        dfloat lap = 0;
 
-	// lift remaining surface terms
+        // lift remaining surface terms
 #pragma unroll p_NfacesNfp
-	for(int i=0;i<p_NfacesNfp;++i){
-	  lap += LIFTT[n+i*p_Np]*s_lapflux[i];
-	}
+        for(int i=0;i<p_NfacesNfp;++i){
+          lap += LIFTT[n+i*p_Np]*s_lapflux[i];
+        }
 
-	s_lapq[n] += lap;
+        s_lapq[n] += lap;
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
 
       if(n<p_Np){
-	const dfloat J = vgeo[e*p_Nvgeo + p_JID];
+        const dfloat J = vgeo[e*p_Nvgeo + p_JID];
 
-	dfloat Mlapq = 0;
+        dfloat Mlapq = 0;
 
-	// multiply by mass matrix
+        // multiply by mass matrix
 #pragma unroll p_Np
-	for(int i=0;i<p_Np;++i){
-	  Mlapq += MM[n+i*p_Np]*s_lapq[i];
-	}
+        for(int i=0;i<p_Np;++i){
+          Mlapq += MM[n+i*p_Np]*s_lapq[i];
+        }
 
-	Aq[n+e*p_Np] = J*Mlapq;
+        Aq[n+e*p_Np] = J*Mlapq;
       }
     }
   }
@@ -251,19 +247,19 @@
 
 // Added multiple element per threadblock
 @kernel void ellipticPartialAxIpdgTri3D(const dlong Nelements,
-					@restrict const  dlong *  elementList,
-					@restrict const  dlong *  vmapM,
-					@restrict const  dlong *  vmapP,
-					const dfloat lambda,
-					const dfloat tau,
-					@restrict const  dfloat *  vgeo,
-					@restrict const  dfloat *  sgeo,
-					@restrict const  int   *  EToB,
-					@restrict const  dfloat *  Dmatrices,
-					@restrict const  dfloat *  LIFTT,
-					@restrict const  dfloat *  MM,
-					@restrict const  dfloat4 *  gradq,
-					@restrict dfloat  *  Aq){
+                                        @restrict const  dlong *  elementList,
+                                        @restrict const  dlong *  vmapM,
+                                        @restrict const  dlong *  vmapP,
+                                        const dfloat lambda,
+                                        const dfloat tau,
+                                        @restrict const  dfloat *  vgeo,
+                                        @restrict const  dfloat *  sgeo,
+                                        @restrict const  int   *  EToB,
+                                        @restrict const  dfloat *  Dmatrices,
+                                        @restrict const  dfloat *  LIFTT,
+                                        @restrict const  dfloat *  MM,
+                                        @restrict const  dfloat4 *  gradq,
+                                        @restrict dfloat  *  Aq){
 
   for(dlong eo=0;eo<Nelements;eo+=(p_NbV*p_Nmt);@outer(0)){
     @shared  dfloat s_dqdx[p_Nmt][p_NbV][p_Np];
@@ -292,249 +288,245 @@
       for(int n=0;n<p_Nmax;++n;@inner(0)){
 
 #pragma unroll p_Nmt
-	for(int em=0;em<p_Nmt;++em){
-	  const dlong e = eo+es*p_Nmt + em;
-	  if(e<Nelements){
-	    element[em] = elementList[e];
-	  }
-	}
+        for(int em=0;em<p_Nmt;++em){
+          const dlong e = eo+es*p_Nmt + em;
+          if(e<Nelements){
+            element[em] = elementList[e];
+          }
+        }
 
 #pragma unroll p_Nmt
-	for(int em=0;em<p_Nmt;++em){
-	  const dlong e = eo+es*p_Nmt + em;
-	  //const int element = elementList[e];
-
-	  if(e<Nelements){
-	    if(n<p_Np){
-	      // assume that this stores (qx, qy, qz, q) as dfloat4
-
-	      const dfloat4 gradqn = gradq[element[em]*p_Np+n];
-
-	      s_dqdx[em][es][n] = gradqn.x;
-	      s_dqdy[em][es][n] = gradqn.y;
-	      s_dqdz[em][es][n] = gradqn.z;
-	      s_lapq[em][es][n] = lambda*gradqn.w;
-	    }
-
-	    if(n<p_NfacesNfp){
-	      const dlong id  = n + element[em]*p_Nfaces*p_Nfp;
-	      idM[em] = vmapM[id];
-	      const dlong idP = vmapP[id];
-	      // find face that owns this node
-	      const int face = n/p_Nfp;
-
-	      dfloat4 gradqM = gradq[idM[em]];// could fetch from @shared after barrier
-	      dfloat4 gradqP = gradq[idP];
-
-	      // load surface geofactors for this face
-	      dlong sid = p_Nsgeo*(element[em]*p_Nfaces+face);
-	      nx[em]   = sgeo[sid+p_NXID];
-	      ny[em]   = sgeo[sid+p_NYID];
-	      nz[em]   = sgeo[sid+p_NZID];
-	      sJ[em]   = sgeo[sid+p_SJID];
-	      invJ[em] = sgeo[sid+p_IJID];
-	      hinv[em] = sgeo[sid+p_IHID];
-
-	      const dfloat dq = gradqP.w - gradqM.w;
-	      const dfloat hlf = 0.5f;
-
-	      s_nxdq[em][es][n] = hlf*sJ[em]*invJ[em]*nx[em]*dq;
-	      s_nydq[em][es][n] = hlf*sJ[em]*invJ[em]*ny[em]*dq;
-	      s_nzdq[em][es][n] = hlf*sJ[em]*invJ[em]*nz[em]*dq;
-
-	      s_lapflux[em][es][n] = hlf*sJ[em]*invJ[em]*(-nx[em]*(gradqP.x-gradqM.x)
-							  -ny[em]*(gradqP.y-gradqM.y)
-							  -nz[em]*(gradqP.z-gradqM.z)
-							  -tau*hinv[em]*dq);
-	    }
-	  }
-	}
+        for(int em=0;em<p_Nmt;++em){
+          const dlong e = eo+es*p_Nmt + em;
+          //const int element = elementList[e];
+
+          if(e<Nelements){
+            if(n<p_Np){
+              // assume that this stores (qx, qy, qz, q) as dfloat4
+
+              const dfloat4 gradqn = gradq[element[em]*p_Np+n];
+
+              s_dqdx[em][es][n] = gradqn.x;
+              s_dqdy[em][es][n] = gradqn.y;
+              s_dqdz[em][es][n] = gradqn.z;
+              s_lapq[em][es][n] = lambda*gradqn.w;
+            }
+
+            if(n<p_NfacesNfp){
+              const dlong id  = n + element[em]*p_Nfaces*p_Nfp;
+              idM[em] = vmapM[id];
+              const dlong idP = vmapP[id];
+              // find face that owns this node
+              const int face = n/p_Nfp;
+
+              dfloat4 gradqM = gradq[idM[em]];// could fetch from @shared after barrier
+              dfloat4 gradqP = gradq[idP];
+
+              // load surface geofactors for this face
+              dlong sid = p_Nsgeo*(element[em]*p_Nfaces+face);
+              nx[em]   = sgeo[sid+p_NXID];
+              ny[em]   = sgeo[sid+p_NYID];
+              nz[em]   = sgeo[sid+p_NZID];
+              sJ[em]   = sgeo[sid+p_SJID];
+              invJ[em] = sgeo[sid+p_IJID];
+              hinv[em] = sgeo[sid+p_IHID];
+
+              const dfloat dq = gradqP.w - gradqM.w;
+              const dfloat hlf = 0.5f;
+
+              s_nxdq[em][es][n] = hlf*sJ[em]*invJ[em]*nx[em]*dq;
+              s_nydq[em][es][n] = hlf*sJ[em]*invJ[em]*ny[em]*dq;
+              s_nzdq[em][es][n] = hlf*sJ[em]*invJ[em]*nz[em]*dq;
+
+              s_lapflux[em][es][n] = hlf*sJ[em]*invJ[em]*(-nx[em]*(gradqP.x-gradqM.x)
+                                                          -ny[em]*(gradqP.y-gradqM.y)
+                                                          -nz[em]*(gradqP.z-gradqM.z)
+                                                          -tau*hinv[em]*dq);
+            }
+          }
+        }
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NbV;++es;@inner(1)){//
       for(int n=0;n<p_Nmax;++n;@inner(0)){
 
-	dfloat Lnxdq[p_Nmt], Lnydq[p_Nmt], Lnzdq[p_Nmt];
+        dfloat Lnxdq[p_Nmt], Lnydq[p_Nmt], Lnzdq[p_Nmt];
 
-	// // Try holding drdx in register array
-	// const dfloat drdx[p_Nmt], drdy[p_Nmt];
-	// const dfloat dsdx[p_Nmt], dsdy[p_Nmt];
+        // // Try holding drdx in register array
+        // const dfloat drdx[p_Nmt], drdy[p_Nmt];
+        // const dfloat dsdx[p_Nmt], dsdy[p_Nmt];
 
 #pragma unroll p_Nmt
-	for(int em=0;em<p_Nmt;++em){
-	  Lnxdq[em] = 0.f;
-	  Lnydq[em] = 0.f;
-	  Lnzdq[em] = 0.f;
-	  //
-	  //  const int e = eo+es*p_Nmt + em;
-	  //  const int gid = element*p_Nvgeo;
-	  // drdx[em] = vgeo[gid + p_RXID];
-	  // drdy[em] = vgeo[gid + p_RYID];
-	  // dsdx[em] = vgeo[gid + p_SXID];
-	  // dsdy[em] = vgeo[gid + p_SYID];
-	}
-
-
-	if(n<p_Np){
+        for(int em=0;em<p_Nmt;++em){
+          Lnxdq[em] = 0.f;
+          Lnydq[em] = 0.f;
+          Lnzdq[em] = 0.f;
+          //
+          //  const int e = eo+es*p_Nmt + em;
+          //  const int gid = element*p_Nvgeo;
+          // drdx[em] = vgeo[gid + p_RXID];
+          // drdy[em] = vgeo[gid + p_RYID];
+          // dsdx[em] = vgeo[gid + p_SXID];
+          // dsdy[em] = vgeo[gid + p_SYID];
+        }
+
+
+        if(n<p_Np){
 
 #pragma unroll p_NfacesNfp
-	  for(int i=0;i<p_NfacesNfp;++i){
-	    const dfloat L = LIFTT[n+i*p_Np];
+          for(int i=0;i<p_NfacesNfp;++i){
+            const dfloat L = LIFTT[n+i*p_Np];
 #pragma unroll p_Nmt
-	    for(int em=0;em<p_Nmt;++em){
-	      Lnxdq[em] += L*s_nxdq[em][es][i];
-	      Lnydq[em] += L*s_nydq[em][es][i];
-	      Lnzdq[em] += L*s_nzdq[em][es][i];
-	    }
-	  }
+            for(int em=0;em<p_Nmt;++em){
+              Lnxdq[em] += L*s_nxdq[em][es][i];
+              Lnydq[em] += L*s_nydq[em][es][i];
+              Lnzdq[em] += L*s_nzdq[em][es][i];
+            }
+          }
 
 #pragma unroll p_Nmt
-	  for(int em=0;em<p_Nmt;++em){
-	    const dlong e = eo+es*p_Nmt + em;
-	    if(e<Nelements){
-	      // const int element = elementList[e];
-	      const dlong gid = element[em]*p_Nvgeo;
-
-	      // These data can be stored on @shared
-	      const dfloat drdx = vgeo[gid + p_RXID];
-	      const dfloat drdy = vgeo[gid + p_RYID];
-	      const dfloat drdz = vgeo[gid + p_RZID];
-	      const dfloat dsdx = vgeo[gid + p_SXID];
-	      const dfloat dsdy = vgeo[gid + p_SYID];
-	      const dfloat dsdz = vgeo[gid + p_SZID];
-
-	      dfloat dqdx = s_dqdx[em][es][n] + Lnxdq[em];
-	      dfloat dqdy = s_dqdy[em][es][n] + Lnydq[em];
-	      dfloat dqdz = s_dqdz[em][es][n] + Lnzdq[em];
-	      s_dqdx[em][es][n] = drdx*dqdx + drdy*dqdy + drdz*dqdz; // abuse of notation
-	      s_dqdy[em][es][n] = dsdx*dqdx + dsdy*dqdy + dsdz*dqdz;
-
-	      s_Lnxdq[em][es][n] = Lnxdq[em];
-	      s_Lnydq[em][es][n] = Lnydq[em];
-	      s_Lnzdq[em][es][n] = Lnzdq[em];
-	    }
-	  }
-
-	}
+          for(int em=0;em<p_Nmt;++em){
+            const dlong e = eo+es*p_Nmt + em;
+            if(e<Nelements){
+              // const int element = elementList[e];
+              const dlong gid = element[em]*p_Nvgeo;
+
+              // These data can be stored on @shared
+              const dfloat drdx = vgeo[gid + p_RXID];
+              const dfloat drdy = vgeo[gid + p_RYID];
+              const dfloat drdz = vgeo[gid + p_RZID];
+              const dfloat dsdx = vgeo[gid + p_SXID];
+              const dfloat dsdy = vgeo[gid + p_SYID];
+              const dfloat dsdz = vgeo[gid + p_SZID];
+
+              dfloat dqdx = s_dqdx[em][es][n] + Lnxdq[em];
+              dfloat dqdy = s_dqdy[em][es][n] + Lnydq[em];
+              dfloat dqdz = s_dqdz[em][es][n] + Lnzdq[em];
+              s_dqdx[em][es][n] = drdx*dqdx + drdy*dqdy + drdz*dqdz; // abuse of notation
+              s_dqdy[em][es][n] = dsdx*dqdx + dsdy*dqdy + dsdz*dqdz;
+
+              s_Lnxdq[em][es][n] = Lnxdq[em];
+              s_Lnydq[em][es][n] = Lnydq[em];
+              s_Lnzdq[em][es][n] = Lnzdq[em];
+            }
+          }
+
+        }
       }
     }
 
 
-    @barrier("local");
 
     for(int es=0;es<p_NbV;++es;@inner(1)){//
       for(int n=0;n<p_Nmax;++n;@inner(0)){
 
-	dfloat lapr[p_Nmt], laps[p_Nmt];
+        dfloat lapr[p_Nmt], laps[p_Nmt];
 #pragma unroll p_Nmt
-	for(int em=0;em<p_Nmt;++em){
+        for(int em=0;em<p_Nmt;++em){
 
-	  if(n<p_NfacesNfp){
-	    int id = idM[em]%p_Np;
-	    s_lapflux[em][es][n] += sJ[em]*invJ[em]*(nx[em]*s_Lnxdq[em][es][id]
-						     +ny[em]*s_Lnydq[em][es][id]
-						     +nz[em]*s_Lnzdq[em][es][id]);
-	  }
+          if(n<p_NfacesNfp){
+            int id = idM[em]%p_Np;
+            s_lapflux[em][es][n] += sJ[em]*invJ[em]*(nx[em]*s_Lnxdq[em][es][id]
+                                                     +ny[em]*s_Lnydq[em][es][id]
+                                                     +nz[em]*s_Lnzdq[em][es][id]);
+          }
 
-	  lapr[em] = 0.f; 
-	  laps[em] = 0.f;
-	}
+          lapr[em] = 0.f;
+          laps[em] = 0.f;
+        }
 
-	if(n<p_Np){
+        if(n<p_Np){
 #pragma unroll p_Np
-	  for(int i=0;i<p_Np;++i){
-	    const dfloat drT = Dmatrices[n+i*p_Np+0*p_Np*p_Np];
-	    const dfloat dsT = Dmatrices[n+i*p_Np+1*p_Np*p_Np];
+          for(int i=0;i<p_Np;++i){
+            const dfloat drT = Dmatrices[n+i*p_Np+0*p_Np*p_Np];
+            const dfloat dsT = Dmatrices[n+i*p_Np+1*p_Np*p_Np];
 
 #pragma unroll p_Nmt
-	    for(int em=0;em<p_Nmt;++em){
-	      lapr[em] += drT*s_dqdx[em][es][i];
-	      laps[em] += dsT*s_dqdy[em][es][i];
-	    }
-	  }
+            for(int em=0;em<p_Nmt;++em){
+              lapr[em] += drT*s_dqdx[em][es][i];
+              laps[em] += dsT*s_dqdy[em][es][i];
+            }
+          }
 
 
-	  if(n<p_Np){
+          if(n<p_Np){
 #pragma unroll p_Nmt
-	    for(int em=0;em<p_Nmt;++em){
-	      s_lapq[em][es][n] -= (lapr[em]+laps[em]);
-	    }
-	  }
-	}
+            for(int em=0;em<p_Nmt;++em){
+              s_lapq[em][es][n] -= (lapr[em]+laps[em]);
+            }
+          }
+        }
       }
     }
 
 
-    @barrier("local");
 
     for(int es=0;es<p_NbV;++es;@inner(1)){//
       for(int n=0;n<p_Nmax;++n;@inner(0)){
-	dfloat lap[p_Nmt];
+        dfloat lap[p_Nmt];
 #pragma unroll p_Nmt
-	for(int em=0;em<p_Nmt;++em){
-	  lap[em] = 0.f;
-	}
+        for(int em=0;em<p_Nmt;++em){
+          lap[em] = 0.f;
+        }
 
-	if(n<p_Np){
-	  // lift remaining surface terms
+        if(n<p_Np){
+          // lift remaining surface terms
 #pragma unroll p_NfacesNfp
-	  for(int i=0;i<p_NfacesNfp;++i){
-	    const dfloat L = LIFTT[n+i*p_Np];
+          for(int i=0;i<p_NfacesNfp;++i){
+            const dfloat L = LIFTT[n+i*p_Np];
 #pragma unroll p_Nmt
-	    for(int em=0;em<p_Nmt;++em){
-	      lap[em] += L*s_lapflux[em][es][i];
-	    }
-	  }
+            for(int em=0;em<p_Nmt;++em){
+              lap[em] += L*s_lapflux[em][es][i];
+            }
+          }
 
 #pragma unroll p_Nmt
-	  for(int em=0;em<p_Nmt;++em){
-	    s_lapq[em][es][n] += lap[em];
-	  } 
-	} 
+          for(int em=0;em<p_Nmt;++em){
+            s_lapq[em][es][n] += lap[em];
+          }
+        }
       }
     }
 
 
-    @barrier("local");
 
     for(int es=0;es<p_NbV;++es;@inner(1)){//
       for(int n=0;n<p_Nmax;++n;@inner(0)){
 
-	dfloat Mlapq[p_Nmt];
+        dfloat Mlapq[p_Nmt];
 
 #pragma unroll p_Nmt
-	for(int em=0;em<p_Nmt;++em){
-	  Mlapq[em] = 0.f;
-	}
+        for(int em=0;em<p_Nmt;++em){
+          Mlapq[em] = 0.f;
+        }
 
-	if(n<p_Np){
+        if(n<p_Np){
 
 #pragma unroll p_Np
-	  for(int i=0;i<p_Np;++i){
-	    const dfloat mm = MM[n+i*p_Np];
+          for(int i=0;i<p_Np;++i){
+            const dfloat mm = MM[n+i*p_Np];
 #pragma unroll p_Nmt
-	    for(int em=0;em<p_Nmt;++em){
-	      Mlapq[em] += mm*s_lapq[em][es][i];
-	    }
-	  }
+            for(int em=0;em<p_Nmt;++em){
+              Mlapq[em] += mm*s_lapq[em][es][i];
+            }
+          }
 
 #pragma unroll p_Nmt
-	  for(int em=0;em<p_Nmt;++em){
-	    const dlong e = eo+es*p_Nmt + em;
+          for(int em=0;em<p_Nmt;++em){
+            const dlong e = eo+es*p_Nmt + em;
 
-	    if(e<Nelements){
-	      // const int element = elementList[e];
-	      const dfloat J = vgeo[element[em]*p_Nvgeo + p_JID];
+            if(e<Nelements){
+              // const int element = elementList[e];
+              const dfloat J = vgeo[element[em]*p_Nvgeo + p_JID];
 
-	      Aq[n+element[em]*p_Np] = J*Mlapq[em];
-	    }
-	  }
-	}
+              Aq[n+element[em]*p_Np] = J*Mlapq[em];
+            }
+          }
+        }
       }
     }
   }
 }
-	
+
diff --git a/solvers/elliptic/okl/ellipticAxQuad2D.okl b/solvers/elliptic/okl/ellipticAxQuad2D.okl
index 28b2f71ec..94f20df66 100644
--- a/solvers/elliptic/okl/ellipticAxQuad2D.okl
+++ b/solvers/elliptic/okl/ellipticAxQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +33,7 @@ SOFTWARE.
 
 // square thread version
 @kernel void ellipticAxQuad2D(const dlong   Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  ggeo,
                                @restrict const  dfloat *  DT,
                                @restrict const  dfloat *  S,
@@ -59,13 +60,12 @@ SOFTWARE.
       s_DT[j][i] = DT[j*p_Nq+i];
     }
 
-    @barrier("local");
 
     squareThreads{
       const dlong base = e*p_Nggeo*p_Np + j*p_Nq + i;
 
       // assumes w*J built into G entries
-      r_GwJ = ggeo[base+p_GWJID*p_Np];
+      r_GwJ = wJ[e*p_Np + j*p_Nq + i];
 
       r_G00 = ggeo[base+p_G00ID*p_Np];
       r_G01 = ggeo[base+p_G01ID*p_Np];
@@ -86,13 +86,11 @@ SOFTWARE.
     }
 
     // r term ----->
-    @barrier("local");
 
     squareThreads{
       s_q[j][i] = r_G00*r_qr + r_G01*r_qs;
     }
 
-    @barrier("local");
 
     squareThreads{
       dfloat tmp = 0.f;
@@ -105,13 +103,11 @@ SOFTWARE.
     }
 
     // s term ---->
-    @barrier("local");
 
     squareThreads{
       s_q[j][i] = r_G01*r_qr + r_G11*r_qs;
     }
 
-    @barrier("local");
 
     squareThreads{
       dfloat tmp = 0.f;
@@ -133,6 +129,7 @@ SOFTWARE.
 @kernel void ellipticPartialAxQuad2D(const dlong Nelements,
                                    @restrict const  dlong   *  elementList,
                                    @restrict const  dlong   *  GlobalToLocal,
+                                   @restrict const  dfloat *  wJ,
                                    @restrict const  dfloat *  ggeo,
                                    @restrict const  dfloat *  DT,
                                    @restrict const  dfloat *  S,
@@ -161,14 +158,13 @@ SOFTWARE.
       s_DT[j][i] = DT[j*p_Nq+i];
     }
 
-    @barrier("local");
 
     squareThreads{
 
       const dlong base = element*p_Nggeo*p_Np + j*p_Nq + i;
 
       // assumes w*J built into G entries
-      r_GwJ = ggeo[base+p_GWJID*p_Np];
+      r_GwJ = wJ[element*p_Np + j*p_Nq + i];
 
       r_G00 = ggeo[base+p_G00ID*p_Np];
       r_G01 = ggeo[base+p_G01ID*p_Np];
@@ -189,13 +185,11 @@ SOFTWARE.
     }
 
     // r term ----->
-    @barrier("local");
 
     squareThreads{
       s_q[j][i] = r_G00*r_qr + r_G01*r_qs;
     }
 
-    @barrier("local");
 
     squareThreads{
       dfloat tmp = 0.f;
@@ -208,13 +202,11 @@ SOFTWARE.
     }
 
     // s term ---->
-    @barrier("local");
 
     squareThreads{
       s_q[j][i] = r_G01*r_qr + r_G11*r_qs;
     }
 
-    @barrier("local");
 
     squareThreads{
       dfloat tmp = 0.f;
diff --git a/solvers/elliptic/okl/ellipticAxQuad3D.okl b/solvers/elliptic/okl/ellipticAxQuad3D.okl
index 8c1323a4e..d953f3a36 100644
--- a/solvers/elliptic/okl/ellipticAxQuad3D.okl
+++ b/solvers/elliptic/okl/ellipticAxQuad3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -27,19 +27,20 @@
 
 // hex @kernel for screened coulomb potential mat-vec
 #define squareThreads                           \
-  for(int j=0; j<p_Nq; ++j; @inner(1))		\
+  for(int j=0; j<p_Nq; ++j; @inner(1))          \
     for(int i=0; i<p_Nq; ++i; @inner(0))
 
 
 // square thread version
 @kernel void ellipticAxQuad3D(const dlong   Nelements,
-			      @restrict const  dfloat *  ggeo,
-			      @restrict const  dfloat *  D,
-			      @restrict const  dfloat *  S,
-			      @restrict const  dfloat *  MM,
-			      const dfloat   lambda,
-			      @restrict const  dfloat *  q,
-			      @restrict dfloat *  Aq){
+                              @restrict const  dfloat *  wJ,
+                              @restrict const  dfloat *  ggeo,
+                              @restrict const  dfloat *  D,
+                              @restrict const  dfloat *  S,
+                              @restrict const  dfloat *  MM,
+                              const dfloat   lambda,
+                              @restrict const  dfloat *  q,
+                              @restrict dfloat *  Aq){
 
   for(dlong e=0;e<Nelements;++e;@outer(0)){
 
@@ -62,13 +63,12 @@
       s_D[j][i] = D[j*p_Nq+i];
     }
 
-    @barrier("local");
 
     squareThreads{
       const dlong base = e*p_Nggeo*p_Np + j*p_Nq + i;
 
       // assumes w*J built into G entries
-      r_GwJ = ggeo[base+p_GWJID*p_Np];
+      r_GwJ = wJ[e*p_Np + j*p_Nq + i];
 
       r_G00 = ggeo[base+p_G00ID*p_Np];
       r_G01 = ggeo[base+p_G01ID*p_Np];
@@ -81,8 +81,8 @@
 
 #pragma unroll p_Nq
       for(int n=0; n<p_Nq; ++n){
-	qr += s_D[i][n]*s_q[j][n];
-	qs += s_D[j][n]*s_q[n][i];
+        qr += s_D[i][n]*s_q[j][n];
+        qs += s_D[j][n]*s_q[n][i];
       }
 
       r_qr = qr; r_qs = qs; r_q = s_q[j][i];
@@ -91,20 +91,18 @@
     }
 
     // r term ----->
-    @barrier("local");
 
     squareThreads{
       // s_q[j][i] = r_G00*r_qr + r_G01*r_qs + r_G02*r_q;
       s_q[j][i] = r_G00*r_qr + r_G01*r_qs;
     }
 
-    @barrier("local");
 
     squareThreads{
       dfloat tmp = 0.f;
 #pragma unroll p_Nq
       for(int n=0;n<p_Nq;++n) {
-	       tmp += s_D[n][i]*s_q[j][n];
+               tmp += s_D[n][i]*s_q[j][n];
       }
       r_Aq += tmp;
     }
@@ -116,14 +114,12 @@
     // }
 
     // s term ---->
-    @barrier("local");
 
     squareThreads{
       // s_q[j][i] = r_G01*r_qr + r_G11*r_qs + 0.f*r_G12*r_q;
       s_q[j][i] = r_G01*r_qr + r_G11*r_qs;
     }
 
-    @barrier("local");
 
     squareThreads{
       // dfloat tmp = r_G22*r_q;
@@ -131,7 +127,7 @@
 
 #pragma unroll p_Nq
       for(int n=0;n<p_Nq;++n) {
-	     tmp += s_D[n][j]*s_q[n][i];
+             tmp += s_D[n][j]*s_q[n][i];
       }
 
       r_Aq += tmp;
@@ -144,15 +140,16 @@
 
 // square thread version
 @kernel void ellipticPartialAxQuad3D(const dlong Nelements,
-				     @restrict const  dlong   *  elementList,
-             @restrict const  dlong   *  GlobalToLocal,
-				     @restrict const  dfloat *  ggeo,
-				     @restrict const  dfloat *  D,
-				     @restrict const  dfloat *  S,
-				     @restrict const  dfloat *  MM,
-				     const dfloat   lambda,
-				     @restrict const  dfloat *  q,
-				     @restrict dfloat *  Aq){
+                                     @restrict const  dlong   *  elementList,
+                                     @restrict const  dlong   *  GlobalToLocal,
+                                     @restrict const  dfloat *  wJ,
+                                     @restrict const  dfloat *  ggeo,
+                                     @restrict const  dfloat *  D,
+                                     @restrict const  dfloat *  S,
+                                     @restrict const  dfloat *  MM,
+                                     const dfloat   lambda,
+                                     @restrict const  dfloat *  q,
+                                     @restrict dfloat *  Aq){
 
   for(dlong e=0;e<Nelements;++e;@outer(0)){
 
@@ -175,13 +172,12 @@
       s_D[j][i] = D[j*p_Nq+i];
     }
 
-    @barrier("local");
 
     squareThreads{
 
       const dlong base = element*p_Nggeo*p_Np + j*p_Nq + i;
       // assumes w*J built into G entries
-      r_GwJ = ggeo[base+p_GWJID*p_Np];
+      r_GwJ = wJ[element*p_Np + j*p_Nq + i];
       r_G00 = ggeo[base+p_G00ID*p_Np];
       r_G01 = ggeo[base+p_G01ID*p_Np];
       r_G02 = ggeo[base+p_G02ID*p_Np];
@@ -193,8 +189,8 @@
 
 #pragma unroll p_Nq
       for(int n=0; n<p_Nq; ++n){
-	qr += s_D[i][n]*s_q[j][n];
-	qs += s_D[j][n]*s_q[n][i];
+        qr += s_D[i][n]*s_q[j][n];
+        qs += s_D[j][n]*s_q[n][i];
       }
 
       r_qr = qr; r_qs = qs; r_q = s_q[j][i];
@@ -203,20 +199,18 @@
     }
 
     // r term ----->
-    @barrier("local");
 
     squareThreads{
       // s_q[j][i] =  r_G00*r_qr + r_G01*r_qs + r_G02*r_q;
       s_q[j][i] =  r_G00*r_qr + r_G01*r_qs;
     }
 
-    @barrier("local");
 
     squareThreads{
       dfloat tmp = 0.f;
 #pragma unroll p_Nq
       for(int n=0;n<p_Nq;++n) {
-	tmp += s_D[n][i]*s_q[j][n];
+        tmp += s_D[n][i]*s_q[j][n];
       }
 
       r_Aq += tmp;
@@ -229,14 +223,12 @@
     // }
 
     // s term ---->
-    @barrier("local");
 
     squareThreads{
       // s_q[j][i] = r_G01*r_qr + r_G11*r_qs + r_G12*r_q;
       s_q[j][i] = r_G01*r_qr + r_G11*r_qs;
     }
 
-    @barrier("local");
 
     squareThreads{
       // dfloat tmp = r_G22*r_q;
@@ -244,7 +236,7 @@
 
 #pragma unroll p_Nq
       for(int n=0;n<p_Nq;++n){
-	       tmp += s_D[n][j]*s_q[n][i];
+               tmp += s_D[n][j]*s_q[n][i];
       }
 
       r_Aq += tmp;
diff --git a/solvers/elliptic/okl/ellipticAxTet3D.okl b/solvers/elliptic/okl/ellipticAxTet3D.okl
index 172029ff6..8a00185a7 100644
--- a/solvers/elliptic/okl/ellipticAxTet3D.okl
+++ b/solvers/elliptic/okl/ellipticAxTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@ SOFTWARE.
 
 
 @kernel void ellipticAxTet3D(const dlong Nelements,
+                            @restrict const  dfloat *  wJ,
                             @restrict const  dfloat *  ggeo,
                             @restrict const  dfloat *  D,
                             @restrict const  dfloat *  S,
@@ -44,7 +45,6 @@ SOFTWARE.
       s_q[n] = q[id];
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){
       const dlong gid = e*p_Nggeo;
@@ -55,7 +55,7 @@ SOFTWARE.
       const dfloat Gss = ggeo[gid + p_G11ID];
       const dfloat Gst = ggeo[gid + p_G12ID];
       const dfloat Gtt = ggeo[gid + p_G22ID];
-      const dfloat J   = ggeo[gid + p_GWJID];
+      const dfloat J   = wJ[e];
 
       dfloat qrr = 0.;
       dfloat qrs = 0.;
@@ -89,6 +89,7 @@ SOFTWARE.
 
 @kernel void ellipticPartialAxTet3D_v0(const dlong Nelements,
                                   @restrict const  dlong   *  elementList,
+                                  @restrict const  dfloat *  wJ,
                                   @restrict const  dfloat *  ggeo,
                                   @restrict const  dfloat *  D,
                                   @restrict const  dfloat *  S,
@@ -108,7 +109,6 @@ SOFTWARE.
       s_q[n] = q[id];
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){
       const dlong element = elementList[e];
@@ -120,7 +120,7 @@ SOFTWARE.
       const dfloat Gss = ggeo[gid + p_G11ID];
       const dfloat Gst = ggeo[gid + p_G12ID];
       const dfloat Gtt = ggeo[gid + p_G22ID];
-      const dfloat J   = ggeo[gid + p_GWJID];
+      const dfloat J   = wJ[element];
 
       dfloat qrr = 0.;
       dfloat qrs = 0.;
@@ -187,6 +187,7 @@ SOFTWARE.
 @kernel void ellipticPartialAxTet3D(const dlong Nelements,
                                   @restrict const  dlong   *  elementList,
                                   @restrict const  dlong   *  GlobalToLocal,
+                                  @restrict const  dfloat *  wJ,
                                   @restrict const  dfloat *  ggeo,
                                   @restrict const  dfloat *  D,
                                   @restrict const  dfloat *  S,
@@ -202,6 +203,7 @@ SOFTWARE.
 
     @shared dfloat s_q[p_Ne][p_Nb][p_Np];
     @shared dfloat s_ggeo[p_Ne][p_Nb][p_Nggeo];
+    @shared dfloat s_wJ[p_Ne][p_Nb];
 
     @exclusive dlong element[p_Ne];
 
@@ -224,12 +226,12 @@ SOFTWARE.
               s_ggeo[et][b][m] = ggeo[element[et]*p_Nggeo+m];
               m += p_Np;
             }
+            s_wJ[et][b] = wJ[element[et]];
           }
         }
       }
     }
 
-    @barrier("local");
 
     for(int b=0;b<p_Nb;++b;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -280,7 +282,7 @@ SOFTWARE.
             const dfloat Gss = s_ggeo[et][b][p_G11ID];
             const dfloat Gst = s_ggeo[et][b][p_G12ID];
             const dfloat Gtt = s_ggeo[et][b][p_G22ID];
-            const dfloat J   = s_ggeo[et][b][p_GWJID];
+            const dfloat J   = s_wJ[et][b];
 
             const dlong id = n + element[et]*p_Np;
 
diff --git a/solvers/elliptic/okl/ellipticAxTri2D.okl b/solvers/elliptic/okl/ellipticAxTri2D.okl
index 7f12e4d5e..04a918b2c 100644
--- a/solvers/elliptic/okl/ellipticAxTri2D.okl
+++ b/solvers/elliptic/okl/ellipticAxTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@ SOFTWARE.
 
 
 @kernel void ellipticAxTri2D(const dlong Nelements,
+                            @restrict const  dfloat *  wJ,
                             @restrict const  dfloat *  ggeo,
                             @restrict const  dfloat *  D,
                             @restrict const  dfloat *  S,
@@ -48,7 +49,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
 
     for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
@@ -60,7 +60,7 @@ SOFTWARE.
           const dfloat Grr = ggeo[gid + p_G00ID];
           const dfloat Grs = ggeo[gid + p_G01ID];
           const dfloat Gss = ggeo[gid + p_G11ID];
-          const dfloat J   = ggeo[gid + p_GWJID];
+          const dfloat J   = wJ[e];
 
           dfloat qrr = 0.;
           dfloat qrs = 0.;
@@ -97,6 +97,7 @@ SOFTWARE.
 @kernel void ellipticPartialAxTri2D(const dlong Nelements,
                                     @restrict const  dlong   *  elementList,
                                     @restrict const  dlong   *  GlobalToLocal,
+                                    @restrict const  dfloat *  wJ,
                                     @restrict const  dfloat *  ggeo,
                                     @restrict const  dfloat *  D,
                                     @restrict const  dfloat *  S,
@@ -121,7 +122,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
 
     for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
@@ -134,7 +134,7 @@ SOFTWARE.
           const dfloat Grr = ggeo[gid + p_G00ID];
           const dfloat Grs = ggeo[gid + p_G01ID];
           const dfloat Gss = ggeo[gid + p_G11ID];
-          const dfloat J   = ggeo[gid + p_GWJID];
+          const dfloat J   = wJ[element];
 
           dfloat qrr = 0.;
           dfloat qrs = 0.;
diff --git a/solvers/elliptic/okl/ellipticAxTri3D.okl b/solvers/elliptic/okl/ellipticAxTri3D.okl
index 5cc5a4db8..8c3c8beac 100644
--- a/solvers/elliptic/okl/ellipticAxTri3D.okl
+++ b/solvers/elliptic/okl/ellipticAxTri3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@
 
 
 @kernel void ellipticAxTri3D(const dlong Nelements,
+                             @restrict const  dfloat *  wJ,
                              @restrict const  dfloat *  ggeo,
                              @restrict const  dfloat *  Dmatrices,
                              @restrict const  dfloat *  Smatrices,
@@ -48,7 +49,6 @@
       }
     }
 
-    @barrier("local");
 
 
     for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
@@ -60,7 +60,7 @@
           const dfloat Grr = ggeo[gid + p_G00ID];
           const dfloat Grs = ggeo[gid + p_G01ID];
           const dfloat Gss = ggeo[gid + p_G11ID];
-          const dfloat J   = ggeo[gid + p_GWJID];
+          const dfloat J   = wJ[e];
 
           dfloat qrr = 0.;
           dfloat qrs = 0.;
@@ -97,6 +97,7 @@
 @kernel void ellipticPartialAxTri3D(const dlong Nelements,
                                     @restrict const  dlong   *  elementList,
                                     @restrict const  dlong   *  GlobalToLocal,
+                                    @restrict const  dfloat *  wJ,
                                     @restrict const  dfloat *  ggeo,
                                     @restrict const  dfloat *  Dmatrices,
                                     @restrict const  dfloat *  Smatrices,
@@ -121,7 +122,6 @@
       }
     }
 
-    @barrier("local");
 
 
     for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
@@ -134,7 +134,7 @@
           const dfloat Grr = ggeo[gid + p_G00ID];
           const dfloat Grs = ggeo[gid + p_G01ID];
           const dfloat Gss = ggeo[gid + p_G11ID];
-          const dfloat J   = ggeo[gid + p_GWJID];
+          const dfloat J   = wJ[element];
 
           dfloat qrr = 0.;
           dfloat qrs = 0.;
diff --git a/solvers/elliptic/okl/ellipticCubatureAxHex3D.okl b/solvers/elliptic/okl/ellipticCubatureAxHex3D.okl
index e5d34b67c..451e15d0f 100644
--- a/solvers/elliptic/okl/ellipticCubatureAxHex3D.okl
+++ b/solvers/elliptic/okl/ellipticCubatureAxHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Kasia Swirydowicz, Noel Chalmers, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Kasia Swirydowicz, Noel Chalmers, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -46,13 +46,13 @@ SOFTWARE.
 #define p_gllPad 0
 
 @kernel void ellipticCubaturePartialAxHex3D_v0(const dlong Nelements,
-					       @restrict const dlong  * elementList,
-					       @restrict const dfloat * cubggeo,
-					       @restrict const dfloat * cubD,
-					       @restrict const dfloat * cubInterpT,
-					       const dfloat lambda,
-					       @restrict const dfloat * q,
-					       @restrict       dfloat * Aq){
+                                               @restrict const dlong  * elementList,
+                                               @restrict const dfloat * cubggeo,
+                                               @restrict const dfloat * cubD,
+                                               @restrict const dfloat * cubInterpT,
+                                               const dfloat lambda,
+                                               @restrict const dfloat * q,
+                                               @restrict       dfloat * Aq){
 
   for(int e=0; e<Nelements; ++e; @outer(0)) {
 
@@ -63,25 +63,25 @@ SOFTWARE.
 
     @exclusive dfloat r_q[p_cubNq];
     @exclusive dfloat r_qs, r_qt;
-    
+
     @exclusive dlong r_element;
-    
+
     for(int b=0;b<p_cubNq;++b;@inner(1)){
       for(int a=0;a<p_cubNq;++a;@inner(0)){
 
-	r_element = elementList[e];
+        r_element = elementList[e];
 
-	int id = a + b*p_cubNq;
+        int id = a + b*p_cubNq;
         if(id<p_cubNq*p_Nq){
           s_I[a][b] = cubInterpT[id];
         }
 
         s_cubD[b][a] = cubD[id];
 
-	//	if(e==0 && a==0 && b==0)
-	//	  printf("cD: ");
-	//	if(e==0)
-	//	  printf("s_cubD[%d][%d]=%lf\n ", b, a, s_cubD[b][a]);
+        //      if(e==0 && a==0 && b==0)
+        //        printf("cD: ");
+        //      if(e==0)
+        //        printf("s_cubD[%d][%d]=%lf\n ", b, a, s_cubD[b][a]);
 
         if(a<p_Nq && b<p_Nq){
 
@@ -108,7 +108,7 @@ SOFTWARE.
 #pragma unroll p_halfC
             for(int j=0;j<p_halfC;++j){
 
-              dfloat tmp = 0; 
+              dfloat tmp = 0;
               dfloat tmp2 = 0;
 
               #pragma unroll p_Nq
@@ -135,7 +135,7 @@ SOFTWARE.
         if(c<p_Nq){
 
           #pragma unroll p_Nq
-	  for(int a=0;a<p_Nq;++a)
+          for(int a=0;a<p_Nq;++a)
               r_q[a] = s_q[c][j][a];
 
           #pragma unroll p_halfC
@@ -152,7 +152,7 @@ SOFTWARE.
                   tmp2 += sIia*r_q[p_Nq-1-a];
                 }
 
-              s_q[c][j][i] = tmp; 
+              s_q[c][j][i] = tmp;
               s_q[c][j][p_cubNq-1-i] = tmp2;
             }
         }
@@ -191,9 +191,9 @@ SOFTWARE.
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
 #pragma unroll p_cubNq
-	for(int k=0; k<p_cubNq; ++k){
-	  r_q[k] =0.0f;
-	}
+        for(int k=0; k<p_cubNq; ++k){
+          r_q[k] =0.0f;
+        }
       }
     }
 
@@ -207,7 +207,7 @@ SOFTWARE.
 
             //geofactors for k j i thread
             const dfloat r_GwJ = cubggeo[base+p_GWJID*p_cubNp];
-	    
+
             const dfloat r_G00 = cubggeo[base+p_G00ID*p_cubNp];
             const dfloat r_G01 = cubggeo[base+p_G01ID*p_cubNp];
             const dfloat r_G02 = cubggeo[base+p_G02ID*p_cubNp];
@@ -216,8 +216,8 @@ SOFTWARE.
             const dfloat r_G12 = cubggeo[base+p_G12ID*p_cubNp];
             const dfloat r_G22 = cubggeo[base+p_G22ID*p_cubNp];
 
-	    //	    if(e==0) printf("r_G=[%g,%g,%g; %g,%g; %g]\n",
-	    //			    r_G00, r_G01, r_G02, r_G11, r_G12, r_G22);
+            //      if(e==0) printf("r_G=[%g,%g,%g; %g,%g; %g]\n",
+            //                      r_G00, r_G01, r_G02, r_G11, r_G12, r_G22);
 
             // now, put together dq/dr, qq/ds, dq/dt and dq/dI
 
@@ -277,7 +277,7 @@ SOFTWARE.
           }
         }
       }//k
-  
+
     //Loop 7
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -315,7 +315,7 @@ SOFTWARE.
                 tmp2 += sIjb*r_q[p_cubNq-1-j];
               }
 
-            s_q[k][b][i] = tmp; 
+            s_q[k][b][i] = tmp;
             s_q[k][p_Nq-b-1][i] = tmp2;
           }
       }
@@ -386,17 +386,17 @@ SOFTWARE.
 
 
 @kernel void ellipticCubaturePartialAxHex3D(const dlong Nelements,
-					    @restrict const dlong  * elementList,
-					    @restrict const dfloat * cubggeo,
-					    @restrict const dfloat * cubD,
-					    @restrict const dfloat * cubInterpT,
-					    const dfloat lambda,
-					    @restrict const dfloat * q,
-					    @restrict       dfloat * Aq){
-
-  
+                                            @restrict const dlong  * elementList,
+                                            @restrict const dfloat * cubggeo,
+                                            @restrict const dfloat * cubD,
+                                            @restrict const dfloat * cubInterpT,
+                                            const dfloat lambda,
+                                            @restrict const dfloat * q,
+                                            @restrict       dfloat * Aq){
+
+
   for(int e=0; e<Nelements; ++e; @outer(0)) {
-    
+
     @shared volatile dfloat s_q[p_cubNq][p_cubNq][p_cubNq];
 
     @shared dfloat s_cubD[p_cubNq][p_cubNq];
@@ -407,15 +407,15 @@ SOFTWARE.
     @shared dfloat s_I[p_cubNq][p_Nq];
 
     @exclusive dfloat r_q[p_cubNq];
-    
+
     @exclusive dlong r_element;
-    
+
     for(int b=0;b<p_cubNq;++b;@inner(1)){
       for(int a=0;a<p_cubNq;++a;@inner(0)){
 
-	r_element = elementList[e];
+        r_element = elementList[e];
 
-	int id = a + b*p_cubNq;
+        int id = a + b*p_cubNq;
         if(id<p_cubNq*p_Nq){
           s_I[a][b] = cubInterpT[id];
         }
@@ -424,14 +424,14 @@ SOFTWARE.
 
         if(a<p_Nq && b<p_Nq){
 
-	  for(int c=0;c<p_Nq;++c){
-	    dlong id = r_element*p_Np + c*p_Nq*p_Nq + b*p_Nq + a;
-	    s_q[c][b][a] = q[id];
-	  }
+          for(int c=0;c<p_Nq;++c){
+            dlong id = r_element*p_Np + c*p_Nq*p_Nq + b*p_Nq + a;
+            s_q[c][b][a] = q[id];
+          }
         }
       }
     }
-    
+
     // ============== interpolate in 3 dir ========================
     // 2. interpolate in b
     for(int c=0;c<p_cubNq;++c;@inner(1)){
@@ -439,28 +439,27 @@ SOFTWARE.
 
         if(a<p_Nq && c<p_Nq){
 
-	  // fetch to registers
-	  for(int b=0;b<p_Nq;++b)
-	    r_q[b] = s_q[c][b][a];
-
-	  // mat-vec Ijb*r_q[b]
-	  for(int j=0;j<p_cubNq;++j){
-
-	    dfloat tmp = 0; 
-	    for(int b=0;b<p_Nq;++b){
-		
-	      const dfloat sIjb= s_I[j][b];
-	      
-	      tmp  += sIjb*r_q[b];
-	    }
-	    // store to s_q[c][j][a], ok since only this thread walks [c][:][a]
-	    s_q[c][j][a] = tmp;
-	  }
+          // fetch to registers
+          for(int b=0;b<p_Nq;++b)
+            r_q[b] = s_q[c][b][a];
+
+          // mat-vec Ijb*r_q[b]
+          for(int j=0;j<p_cubNq;++j){
+
+            dfloat tmp = 0;
+            for(int b=0;b<p_Nq;++b){
+
+              const dfloat sIjb= s_I[j][b];
+
+              tmp  += sIjb*r_q[b];
+            }
+            // store to s_q[c][j][a], ok since only this thread walks [c][:][a]
+            s_q[c][j][a] = tmp;
+          }
         }
       }
     }//for c
 
-    @barrier("local");
 
     // 3. transform in a
     for(int c=0;c<p_cubNq;++c;@inner(1)){
@@ -468,46 +467,45 @@ SOFTWARE.
 
         if(c<p_Nq){
 
-	  for(int a=0;a<p_Nq;++a)
-	    r_q[a] = s_q[c][j][a];
-
-	  for(int i=0;i<p_cubNq;++i){
-	    
-	    dfloat tmp = 0;
-	    
-	    for(int a=0;a<p_Nq;++a){
-	      
-	      const dfloat sIia = s_I[i][a];
-	      tmp  += sIia*r_q[a];
-	    }
-	    
-	    s_q[c][j][i] = tmp; 
-	  }
+          for(int a=0;a<p_Nq;++a)
+            r_q[a] = s_q[c][j][a];
+
+          for(int i=0;i<p_cubNq;++i){
+
+            dfloat tmp = 0;
+
+            for(int a=0;a<p_Nq;++a){
+
+              const dfloat sIia = s_I[i][a];
+              tmp  += sIia*r_q[a];
+            }
+
+            s_q[c][j][i] = tmp;
+          }
         }
       }
     }
 
-    @barrier("local");
 
     // 4. transform in c
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
 
-	for(int c=0;c<p_Nq;++c)
-	  r_q[c] = s_q[c][j][i];
-
-	for(int k=0;k<p_cubNq;++k){
-
-	  dfloat tmp = 0;
-	    
-	  for(int c=0;c<p_Nq;++c){
-	      
-	    const dfloat sIkc = s_I[k][c];
-	    tmp  += sIkc*r_q[c];
-	  }
-	    
-	  s_q[k][j][i] = tmp; // ok since only this thread
-	}
+        for(int c=0;c<p_Nq;++c)
+          r_q[c] = s_q[c][j][i];
+
+        for(int k=0;k<p_cubNq;++k){
+
+          dfloat tmp = 0;
+
+          for(int c=0;c<p_Nq;++c){
+
+            const dfloat sIkc = s_I[k][c];
+            tmp  += sIkc*r_q[c];
+          }
+
+          s_q[k][j][i] = tmp; // ok since only this thread
+        }
       }
     }
 
@@ -516,173 +514,169 @@ SOFTWARE.
 
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
-	// use r_q to accumulate Aq
-	for(int k=0; k<p_cubNq; ++k){
-	  r_q[k] =0.0f;
-	}
+        // use r_q to accumulate Aq
+        for(int k=0; k<p_cubNq; ++k){
+          r_q[k] =0.0f;
+        }
       }
     }
-  
+
 #pragma unroll p_cubNq
     for(int k=0; k<p_cubNq; ++k) {
 
-      @barrier("local");
-      
+
       for(int j=0; j<p_cubNq; ++j; @inner(1)) {
-	for(int i=0; i<p_cubNq; ++i; @inner(0)) {
-	  
-	  const int base = r_element*p_Nggeo*p_cubNp + k*p_cubNq*p_cubNq + j*p_cubNq + i;
+        for(int i=0; i<p_cubNq; ++i; @inner(0)) {
+
+          const int base = r_element*p_Nggeo*p_cubNp + k*p_cubNq*p_cubNq + j*p_cubNq + i;
 
-	  //geofactors for k j i thread
-	  const dfloat r_GwJ = cubggeo[base+p_GWJID*p_cubNp];
-	    
-	  const dfloat r_G00 = cubggeo[base+p_G00ID*p_cubNp];
-	  const dfloat r_G01 = cubggeo[base+p_G01ID*p_cubNp];
-	  const dfloat r_G02 = cubggeo[base+p_G02ID*p_cubNp];
+          //geofactors for k j i thread
+          const dfloat r_GwJ = cubggeo[base+p_GWJID*p_cubNp];
 
-	  const dfloat r_G11 = cubggeo[base+p_G11ID*p_cubNp];
-	  const dfloat r_G12 = cubggeo[base+p_G12ID*p_cubNp];
+          const dfloat r_G00 = cubggeo[base+p_G00ID*p_cubNp];
+          const dfloat r_G01 = cubggeo[base+p_G01ID*p_cubNp];
+          const dfloat r_G02 = cubggeo[base+p_G02ID*p_cubNp];
 
-	  const dfloat r_G22 = cubggeo[base+p_G22ID*p_cubNp];
+          const dfloat r_G11 = cubggeo[base+p_G11ID*p_cubNp];
+          const dfloat r_G12 = cubggeo[base+p_G12ID*p_cubNp];
 
-	  // 'r', 's', 't' derivatives
-	  dfloat dr = 0.0f;
-	  dfloat ds = 0.0f;
-	  dfloat dt = 0.0f;
+          const dfloat r_G22 = cubggeo[base+p_G22ID*p_cubNp];
+
+          // 'r', 's', 't' derivatives
+          dfloat dr = 0.0f;
+          dfloat ds = 0.0f;
+          dfloat dt = 0.0f;
 
 #pragma unroll p_cubNq
-	  for (int n = 0; n<p_cubNq; ++n) {
-	    dr += s_cubD[i][n]*s_q[k][j][n];
-	    ds += s_cubD[j][n]*s_q[k][n][i];
-	    dt += s_cubD[k][n]*s_q[n][j][i];
-	  }
+          for (int n = 0; n<p_cubNq; ++n) {
+            dr += s_cubD[i][n]*s_q[k][j][n];
+            ds += s_cubD[j][n]*s_q[k][n][i];
+            dt += s_cubD[k][n]*s_q[n][j][i];
+          }
 
-	  s_qr[j][i] = r_G00*dr + r_G01*ds + r_G02*dt;
-	  s_qs[j][i] = r_G01*dr + r_G11*ds + r_G12*dt;
+          s_qr[j][i] = r_G00*dr + r_G01*ds + r_G02*dt;
+          s_qs[j][i] = r_G01*dr + r_G11*ds + r_G12*dt;
 
-	  const dfloat r_qt = r_G02*dr + r_G12*ds + r_G22*dt;
+          const dfloat r_qt = r_G02*dr + r_G12*ds + r_G22*dt;
 
-	  for(int n=0;n<p_cubNq;++n){
-	    r_q[n] += s_cubD[k][n]*r_qt;
-	  }
+          for(int n=0;n<p_cubNq;++n){
+            r_q[n] += s_cubD[k][n]*r_qt;
+          }
 
-	  r_q[k] += lambda*r_GwJ*s_q[k][j][i];
-	}
+          r_q[k] += lambda*r_GwJ*s_q[k][j][i];
+        }
       }
-      
-      @barrier("local");
-      
+
+
       // weak diff
       for(int j=0;j<p_cubNq;++j;@inner(1)){
-	for(int i=0;i<p_cubNq;++i;@inner(0)){
-	    
-	  dfloat lapqr = 0.0f, lapqs = 0.0f;
-	    
+        for(int i=0;i<p_cubNq;++i;@inner(0)){
+
+          dfloat lapqr = 0.0f, lapqs = 0.0f;
+
 #pragma unroll p_cubNq
-	  for(int n=0;n<p_cubNq;++n){
-	    lapqr += s_cubD[n][i]*s_qr[j][n];
-	    lapqs += s_cubD[n][j]*s_qs[n][i];
-	  }
-	      
-	  r_q[k] += lapqr+lapqs;
-	}
+          for(int n=0;n<p_cubNq;++n){
+            lapqr += s_cubD[n][i]*s_qr[j][n];
+            lapqs += s_cubD[n][j]*s_qs[n][i];
+          }
+
+          r_q[k] += lapqr+lapqs;
+        }
       }
     }//k
-  
-    @barrier("local");
-    
+
+
     // share r_q[:]
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
-	  
+
 #pragma unroll p_cubNq
-	for(int k=0; k<p_cubNq; ++k) {
-	  s_q[k][j][i] = r_q[k];
-	}
+        for(int k=0; k<p_cubNq; ++k) {
+          s_q[k][j][i] = r_q[k];
+        }
       }
     }
-  
+
     //=========== now project =================================================
     // b -> c -> a
 
-    @barrier("local");
 
-  
+
     // test in b
     for(int k=0;k<p_cubNq;++k;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
-	  
+
 #pragma unroll p_cubNq
-	for(int j=0;j<p_cubNq;++j){
-	  r_q[j] = s_q[k][j][i];
-	}
-	  
-	for(int b=0;b<p_Nq;++b){
-	    
-	  dfloat tmp = 0.0f;
-	    
+        for(int j=0;j<p_cubNq;++j){
+          r_q[j] = s_q[k][j][i];
+        }
+
+        for(int b=0;b<p_Nq;++b){
+
+          dfloat tmp = 0.0f;
+
 #pragma unroll p_cubNq
-	  for(int j=0;j<p_cubNq;++j){
-
-	    const dfloat sIjb = s_I[j][b];
-	    tmp  += sIjb*r_q[j];
-	  }
-	    
-	  s_q[k][b][i] = tmp; 
-	}
+          for(int j=0;j<p_cubNq;++j){
+
+            const dfloat sIjb = s_I[j][b];
+            tmp  += sIjb*r_q[j];
+          }
+
+          s_q[k][b][i] = tmp;
+        }
       }
     }
-      
+
     // transform back in a
-      
+
     for(int k=0;k<p_cubNq;++k;@inner(1)){
       for(int b=0;b<p_cubNq;++b;@inner(0)){
-	if(b<p_Nq){
+        if(b<p_Nq){
 #pragma unroll p_cubNq
-	  for(int i=0;i<p_cubNq;++i)
-	    r_q[i] = s_q[k][b][i];
-	    
-	  for(int a=0;a<p_Nq;++a){
-	      
-	    dfloat tmp  = 0.0f;
-	      
+          for(int i=0;i<p_cubNq;++i)
+            r_q[i] = s_q[k][b][i];
+
+          for(int a=0;a<p_Nq;++a){
+
+            dfloat tmp  = 0.0f;
+
 #pragma unroll p_cubNq
-	    for(int i=0;i<p_cubNq;++i){
-		
-	      const dfloat sIia = s_I[i][a];
-	      tmp  += sIia*r_q[i];
-	    }
-	      
-	    s_q[k][b][a] =tmp;
-	  }
-	}
+            for(int i=0;i<p_cubNq;++i){
+
+              const dfloat sIia = s_I[i][a];
+              tmp  += sIia*r_q[i];
+            }
+
+            s_q[k][b][a] =tmp;
+          }
+        }
       }
     }
-      
+
     // transform back in c
     for(int b=0;b<p_cubNq;++b;@inner(1)){
       for(int a=0;a<p_cubNq;++a;@inner(0)){
-	if(a<p_Nq && b<p_Nq){
-	    
+        if(a<p_Nq && b<p_Nq){
+
 #pragma unroll p_cubNq
-	  for(int k=0;k<p_cubNq;++k){
-	    r_q[k] = s_q[k][b][a];
-	  }
+          for(int k=0;k<p_cubNq;++k){
+            r_q[k] = s_q[k][b][a];
+          }
 
-	  for(int c=0;c<p_Nq;++c){
+          for(int c=0;c<p_Nq;++c){
 
-	    dfloat tmp  = 0.0f;
+            dfloat tmp  = 0.0f;
 
 #pragma unroll p_cubNq
-	    for(int k=0;k<p_cubNq;++k){
+            for(int k=0;k<p_cubNq;++k){
 
-	      const dfloat sIkc = s_I[k][c];
-	      tmp  += sIkc*r_q[k];
-	    }
+              const dfloat sIkc = s_I[k][c];
+              tmp  += sIkc*r_q[k];
+            }
 
-	    dlong id = r_element*p_Np + c*p_Nq*p_Nq + b*p_Nq + a;
-	    Aq[id] = tmp;
-	  }//c
+            dlong id = r_element*p_Np + c*p_Nq*p_Nq + b*p_Nq + a;
+            Aq[id] = tmp;
+          }//c
         }//if
       }//a
     }//b
diff --git a/solvers/elliptic/okl/ellipticGradientHex3D.okl b/solvers/elliptic/okl/ellipticGradientHex3D.okl
index fa22d3cc6..9b7016d94 100644
--- a/solvers/elliptic/okl/ellipticGradientHex3D.okl
+++ b/solvers/elliptic/okl/ellipticGradientHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -54,7 +54,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int k=0;k<p_Nq;++k){
@@ -127,7 +126,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int k=0;k<p_Nq;++k){
diff --git a/solvers/elliptic/okl/ellipticGradientQuad2D.okl b/solvers/elliptic/okl/ellipticGradientQuad2D.okl
index b71f9bdbb..7de75345e 100644
--- a/solvers/elliptic/okl/ellipticGradientQuad2D.okl
+++ b/solvers/elliptic/okl/ellipticGradientQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -50,7 +50,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -107,7 +106,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticGradientQuad3D.okl b/solvers/elliptic/okl/ellipticGradientQuad3D.okl
index 9b9f4de12..00dfa2f9f 100644
--- a/solvers/elliptic/okl/ellipticGradientQuad3D.okl
+++ b/solvers/elliptic/okl/ellipticGradientQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -50,7 +50,6 @@ SOFTWARE.
       }
     }
     
-    @barrier("local");
     
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -120,7 +119,6 @@ SOFTWARE.
       }
     }
     
-    @barrier("local");
     
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticGradientTet3D.okl b/solvers/elliptic/okl/ellipticGradientTet3D.okl
index 9cff4bceb..6f3a61bd3 100644
--- a/solvers/elliptic/okl/ellipticGradientTet3D.okl
+++ b/solvers/elliptic/okl/ellipticGradientTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -46,7 +46,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -110,7 +109,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticGradientTri2D.okl b/solvers/elliptic/okl/ellipticGradientTri2D.okl
index b09f69d40..1dfbc7589 100644
--- a/solvers/elliptic/okl/ellipticGradientTri2D.okl
+++ b/solvers/elliptic/okl/ellipticGradientTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -43,7 +43,6 @@ SOFTWARE.
       s_q[n] = q[id];
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){
 
@@ -107,7 +106,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int e=eo;e<eo+p_NblockV;++e;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -219,7 +217,6 @@ SOFTWARE.
         }
       }
 
-    @barrier("local");
 
     for(int es=0;es<p_NbV; ++es; @inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticGradientTri3D.okl b/solvers/elliptic/okl/ellipticGradientTri3D.okl
index 7cd20cc74..2bf8c45c0 100644
--- a/solvers/elliptic/okl/ellipticGradientTri3D.okl
+++ b/solvers/elliptic/okl/ellipticGradientTri3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -62,7 +62,6 @@ SOFTWARE.
       }
     }
     
-    @barrier("local");
 
     for(int e=eo;e<eo+p_NblockV;++e;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -81,14 +80,14 @@ SOFTWARE.
               qr += drt*qn;
               qs += dst*qn;
             }
-          
+
           dfloat4 gradqn;
           gradqn.x = drdx*qr + dsdx*qs;
           gradqn.y = drdy*qr + dsdy*qs;
-	  gradqn.z = drdz*qr + dsdz*qs;
+          gradqn.z = drdz*qr + dsdz*qs;
           gradqn.w = s_q[es][n];
-          
-          const int id = e*p_Np+n; 
+
+          const int id = e*p_Np+n;
           gradq[id] = gradqn;
         }
       }
@@ -154,10 +153,10 @@ SOFTWARE.
 @kernel void ellipticPartialGradientTri3D(const dlong Nelements,
          const dlong offset,
          @restrict const  dfloat *  vgeo,
-         @restrict const  dfloat *  Dmatrices, 
+         @restrict const  dfloat *  Dmatrices,
          @restrict const  dfloat *  q,
               @restrict dfloat4 *  gradq){
-  
+
   for(dlong eo=0;eo<Nelements;eo+=(p_NbV*p_Nmt);@outer(0)){
 
     @shared dfloat s_q[p_Nmt][p_NbV][p_Np];
@@ -165,17 +164,16 @@ SOFTWARE.
     for(int es=0; es<p_NbV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
         #pragma unroll p_Nmt
-        for(int em=0;em<p_Nmt;++em){    
-          const dlong e = eo+es*p_Nmt + em; 
+        for(int em=0;em<p_Nmt;++em){
+          const dlong e = eo+es*p_Nmt + em;
             if(e<Nelements){
               const dlong id = n + (e+offset)*p_Np;
               s_q[em][es][n] = q[id];
             }
-          }  
+          }
         }
       }
 
-    @barrier("local");
 
     for(int es=0;es<p_NbV; ++es; @inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -184,20 +182,20 @@ SOFTWARE.
         // hold geometric factors on register
         dfloat drdx2[p_Nmt], dsdx2[p_Nmt];
         dfloat drdy2[p_Nmt], dsdy2[p_Nmt];
-	dfloat drdz2[p_Nmt], dsdz2[p_Nmt];
-        
+        dfloat drdz2[p_Nmt], dsdz2[p_Nmt];
+
         #pragma unroll p_Nmt
-        for(int em=0;em<p_Nmt;++em){ 
-          const dlong e = eo+es*p_Nmt + em+offset; 
+        for(int em=0;em<p_Nmt;++em){
+          const dlong e = eo+es*p_Nmt + em+offset;
           qr[em] = 0.f;
           qs[em] = 0.f;
           //
           drdx2[em] = vgeo[p_Nvgeo*e+p_RXID];
           drdy2[em] = vgeo[p_Nvgeo*e+p_RYID];
-	  drdz2[em] = vgeo[p_Nvgeo*e+p_RZID];
+          drdz2[em] = vgeo[p_Nvgeo*e+p_RZID];
           dsdx2[em] = vgeo[p_Nvgeo*e+p_SXID];
           dsdy2[em] = vgeo[p_Nvgeo*e+p_SYID];
-	  dsdz2[em] = vgeo[p_Nvgeo*e+p_SZID];
+          dsdz2[em] = vgeo[p_Nvgeo*e+p_SZID];
         }
 
         #pragma unroll p_Np
@@ -206,7 +204,7 @@ SOFTWARE.
           dfloat dst = Dmatrices[n + i*p_Np +1*p_Np*p_Np];
 
           #pragma unroll p_Nmt
-          for(int em=0;em<p_Nmt;++em){ 
+          for(int em=0;em<p_Nmt;++em){
             dfloat qn = s_q[em][es][i];
 
             qr[em] += drt*qn;
@@ -217,13 +215,13 @@ SOFTWARE.
         dfloat4 gradqn;
 
         #pragma unroll p_Nmt
-        for(int em=0;em<p_Nmt;++em){ 
-         const dlong e  = eo+es*p_Nmt + em; 
-         if(e<Nelements){ 
-           const dlong id = (e+offset)*p_Np+n; 
+        for(int em=0;em<p_Nmt;++em){
+         const dlong e  = eo+es*p_Nmt + em;
+         if(e<Nelements){
+           const dlong id = (e+offset)*p_Np+n;
            gradqn.x = drdx2[em]*qr[em] + dsdx2[em]*qs[em];
            gradqn.y = drdy2[em]*qr[em] + dsdy2[em]*qs[em];
-	   gradqn.z = drdz2[em]*qr[em] + dsdz2[em]*qs[em];
+           gradqn.z = drdz2[em]*qr[em] + dsdz2[em]*qs[em];
            gradqn.w = s_q[em][es][n];
 
            gradq[id] = gradqn;
diff --git a/solvers/elliptic/okl/ellipticMask.okl b/solvers/elliptic/okl/ellipticMask.okl
index 08113be84..21263292f 100644
--- a/solvers/elliptic/okl/ellipticMask.okl
+++ b/solvers/elliptic/okl/ellipticMask.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/elliptic/okl/ellipticPatchSolver.okl b/solvers/elliptic/okl/ellipticPatchSolver.okl
index 54bc55821..3323b9593 100644
--- a/solvers/elliptic/okl/ellipticPatchSolver.okl
+++ b/solvers/elliptic/okl/ellipticPatchSolver.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -47,7 +47,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over elements/nodes in patch and do local smooth
     for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
diff --git a/solvers/elliptic/okl/ellipticPreconBlockJacobi.okl b/solvers/elliptic/okl/ellipticPreconBlockJacobi.okl
index 8b0006cf3..be50adb52 100644
--- a/solvers/elliptic/okl/ellipticPreconBlockJacobi.okl
+++ b/solvers/elliptic/okl/ellipticPreconBlockJacobi.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -56,7 +56,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -111,7 +110,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -131,4 +129,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/elliptic/okl/ellipticPreconCoarsenHex3D.okl b/solvers/elliptic/okl/ellipticPreconCoarsenHex3D.okl
index 88fad4139..66eff1130 100644
--- a/solvers/elliptic/okl/ellipticPreconCoarsenHex3D.okl
+++ b/solvers/elliptic/okl/ellipticPreconCoarsenHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -51,7 +51,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in k index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -69,7 +68,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in j index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -87,7 +85,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in i index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -134,7 +131,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_NqFine;++j;@inner(1)){
       for(int i=0;i<p_NqFine;++i;@inner(0)){
@@ -157,7 +153,6 @@ SOFTWARE.
 
     for(int k=0;k<p_NqCoarse;++k){
 
-      @barrier("local");
 
       for(int j=0;j<p_NqFine;++j;@inner(1)){
         for(int i=0;i<p_NqFine;++i;@inner(0)){
@@ -166,7 +161,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_NqFine;++j;@inner(1)){
         for(int i=0;i<p_NqFine;++i;@inner(0)){
@@ -185,7 +179,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_NqFine;++j;@inner(1)){
         for(int i=0;i<p_NqFine;++i;@inner(0)){
@@ -244,7 +237,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in k index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -262,7 +254,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in j index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -280,7 +271,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in i index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -301,4 +291,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/elliptic/okl/ellipticPreconCoarsenQuad2D.okl b/solvers/elliptic/okl/ellipticPreconCoarsenQuad2D.okl
index baf1c9406..b247b1fa2 100644
--- a/solvers/elliptic/okl/ellipticPreconCoarsenQuad2D.okl
+++ b/solvers/elliptic/okl/ellipticPreconCoarsenQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -48,7 +48,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in j index
     for(int j=0;j<p_NqFine;++j;@inner(1)){
@@ -64,7 +63,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in i index
     for(int j=0;j<p_NqFine;++j;@inner(1)){
@@ -112,7 +110,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in j index
     for(int j=0;j<p_NqFine;++j;@inner(1)){
@@ -128,7 +125,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in i index
     for(int j=0;j<p_NqFine;++j;@inner(1)){
@@ -147,4 +143,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/elliptic/okl/ellipticPreconCoarsenTet3D.okl b/solvers/elliptic/okl/ellipticPreconCoarsenTet3D.okl
index e5dc3f290..7c8bff8d1 100644
--- a/solvers/elliptic/okl/ellipticPreconCoarsenTet3D.okl
+++ b/solvers/elliptic/okl/ellipticPreconCoarsenTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -46,7 +46,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockVCoarse;++es;@inner(1)){
       for(int n=0;n<p_NpCoarse;++n;@inner(0)){
@@ -91,7 +90,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockVCoarse;++es;@inner(1)){
       for(int n=0;n<p_NpCoarse;++n;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticPreconCoarsenTri2D.okl b/solvers/elliptic/okl/ellipticPreconCoarsenTri2D.okl
index 5590a3108..e2522c663 100644
--- a/solvers/elliptic/okl/ellipticPreconCoarsenTri2D.okl
+++ b/solvers/elliptic/okl/ellipticPreconCoarsenTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -44,7 +44,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockVCoarse;++es;@inner(1)){
       for(int n=0;n<p_NpCoarse;++n;@inner(0)){
@@ -90,7 +89,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockVCoarse;++es;@inner(1)){
       for(int n=0;n<p_NpCoarse;++n;@inner(0)){
@@ -141,7 +139,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockVCoarse;++es;@inner(1)){
       for(int n=0;n<p_NpCoarse;++n;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticPreconProlongateHex3D.okl b/solvers/elliptic/okl/ellipticPreconProlongateHex3D.okl
index 3cc005311..aeb389247 100644
--- a/solvers/elliptic/okl/ellipticPreconProlongateHex3D.okl
+++ b/solvers/elliptic/okl/ellipticPreconProlongateHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -53,7 +53,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // prolongate in i index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -71,7 +70,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // prolongate in j index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -89,7 +87,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in i index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -132,7 +129,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_NqFine;++j;@inner(1)){
       for(int i=0;i<p_NqFine;++i;@inner(0)){
@@ -161,7 +157,6 @@ SOFTWARE.
 
     for(int k=0;k<p_NqFine;++k){
 
-      @barrier("local");
 
       for(int j=0;j<p_NqFine;++j;@inner(1)){
         for(int i=0;i<p_NqFine;++i;@inner(0)){
@@ -177,7 +172,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_NqFine;++j;@inner(1)){
         for(int i=0;i<p_NqFine;++i;@inner(0)){
@@ -201,7 +195,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_NqFine;++j;@inner(1)){
         for(int i=0;i<p_NqFine;++i;@inner(0)){
@@ -253,7 +246,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // prolongate in i index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -271,7 +263,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // prolongate in j index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -289,7 +280,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in i index
     for(int k=0;k<p_NqFine;++k;@inner(2)){
@@ -308,4 +298,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/elliptic/okl/ellipticPreconProlongateQuad2D.okl b/solvers/elliptic/okl/ellipticPreconProlongateQuad2D.okl
index 6628cdb97..d597c48b3 100644
--- a/solvers/elliptic/okl/ellipticPreconProlongateQuad2D.okl
+++ b/solvers/elliptic/okl/ellipticPreconProlongateQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -49,7 +49,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // prolongate in i index
     for(int j=0;j<p_NqFine;++j;@inner(1)){
@@ -65,7 +64,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in i index
     for(int j=0;j<p_NqFine;++j;@inner(1)){
@@ -112,7 +110,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // prolongate in i index
     for(int j=0;j<p_NqFine;++j;@inner(1)){
@@ -128,7 +125,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // coarsen in i index
     for(int j=0;j<p_NqFine;++j;@inner(1)){
@@ -145,4 +141,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/elliptic/okl/ellipticPreconProlongateTet3D.okl b/solvers/elliptic/okl/ellipticPreconProlongateTet3D.okl
index 17041c455..4d8a693c2 100644
--- a/solvers/elliptic/okl/ellipticPreconProlongateTet3D.okl
+++ b/solvers/elliptic/okl/ellipticPreconProlongateTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -45,7 +45,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockVFine;++es;@inner(1)){
       for(int n=0;n<p_NpFine;++n;@inner(0)){
@@ -91,7 +90,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockVFine;++es;@inner(1)){
       for(int n=0;n<p_NpFine;++n;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticPreconProlongateTri2D.okl b/solvers/elliptic/okl/ellipticPreconProlongateTri2D.okl
index ca323dca4..cfd9a4170 100644
--- a/solvers/elliptic/okl/ellipticPreconProlongateTri2D.okl
+++ b/solvers/elliptic/okl/ellipticPreconProlongateTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -44,7 +44,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockVFine;++es;@inner(1)){
       for(int n=0;n<p_NpFine;++n;@inner(0)){
@@ -90,7 +89,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockVFine;++es;@inner(1)){
       for(int n=0;n<p_NpFine;++n;@inner(0)){
@@ -139,7 +137,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockVFine;++es;@inner(1)){
       for(int n=0;n<p_NpFine;++n;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticRhsBCHex3D.okl b/solvers/elliptic/okl/ellipticRhsBCHex3D.okl
index 53146b029..fa9abc5e6 100644
--- a/solvers/elliptic/okl/ellipticRhsBCHex3D.okl
+++ b/solvers/elliptic/okl/ellipticRhsBCHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -58,6 +58,7 @@ void surfaceTerms(int sk,
 }
 
 @kernel void ellipticRhsBCHex3D(const dlong Nelements,
+                              @restrict const  dfloat *  wJ,
                               @restrict const  dfloat *  ggeo,
                               @restrict const  dfloat *  sgeo,
                               @restrict const  dfloat *  DT,
@@ -109,7 +110,6 @@ void surfaceTerms(int sk,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -124,7 +124,6 @@ void surfaceTerms(int sk,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -140,7 +139,6 @@ void surfaceTerms(int sk,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -162,7 +160,6 @@ void surfaceTerms(int sk,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -178,7 +175,6 @@ void surfaceTerms(int sk,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -200,7 +196,6 @@ void surfaceTerms(int sk,
       }
     }
 
-    @barrier("local");
 
 
     // Layer by layer
@@ -220,11 +215,10 @@ void surfaceTerms(int sk,
             r_G12 = ggeo[gbase+p_G12ID*p_Np];
             r_G22 = ggeo[gbase+p_G22ID*p_Np];
 
-            r_GwJ = ggeo[gbase+p_GWJID*p_Np];
+            r_GwJ = wJ[e*p_Np + k*p_Nq*p_Nq + j*p_Nq + i];
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -241,7 +235,6 @@ void surfaceTerms(int sk,
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -265,7 +258,6 @@ void surfaceTerms(int sk,
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticRhsBCIpdgHex3D.okl b/solvers/elliptic/okl/ellipticRhsBCIpdgHex3D.okl
index dc951f132..a9857a821 100644
--- a/solvers/elliptic/okl/ellipticRhsBCIpdgHex3D.okl
+++ b/solvers/elliptic/okl/ellipticRhsBCIpdgHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -145,7 +145,6 @@ void surfaceTerms(int e,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -163,7 +162,6 @@ void surfaceTerms(int e,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -182,7 +180,6 @@ void surfaceTerms(int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -200,7 +197,6 @@ void surfaceTerms(int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -226,7 +222,6 @@ void surfaceTerms(int e,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -245,7 +240,6 @@ void surfaceTerms(int e,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -271,7 +265,6 @@ void surfaceTerms(int e,
       }
     }
 
-    @barrier("local");
 
     //layer by layer
     #pragma unroll p_Nq
@@ -303,7 +296,6 @@ void surfaceTerms(int e,
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -320,7 +312,6 @@ void surfaceTerms(int e,
         }
       }
 
-      @barrier("local");
     }
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
diff --git a/solvers/elliptic/okl/ellipticRhsBCIpdgQuad2D.okl b/solvers/elliptic/okl/ellipticRhsBCIpdgQuad2D.okl
index 4446244cd..e3905c984 100644
--- a/solvers/elliptic/okl/ellipticRhsBCIpdgQuad2D.okl
+++ b/solvers/elliptic/okl/ellipticRhsBCIpdgQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -95,7 +95,6 @@ void surfaceTerms(int e, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");
 
     // loop over faces to add pseudo-gradient
 
@@ -113,7 +112,6 @@ void surfaceTerms(int e, int sk, int face, int i, int j,
                    s_dqdx, s_dqdy, s_rhsq);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -130,7 +128,6 @@ void surfaceTerms(int e, int sk, int face, int i, int j,
 
     }
 
-    @barrier("local");
 
     // prescale by geofacs
     for(int j=0;j<p_Nq;++j){
@@ -152,7 +149,6 @@ void surfaceTerms(int e, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
diff --git a/solvers/elliptic/okl/ellipticRhsBCIpdgTet3D.okl b/solvers/elliptic/okl/ellipticRhsBCIpdgTet3D.okl
index d8e096317..0d2e45870 100644
--- a/solvers/elliptic/okl/ellipticRhsBCIpdgTet3D.okl
+++ b/solvers/elliptic/okl/ellipticRhsBCIpdgTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -91,7 +91,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dqdx += LIFT*(sJ/J)*nx*dq
     for(int n=0;n<p_Nmax;++n;@inner(0)){
@@ -133,7 +132,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -156,7 +154,6 @@ SOFTWARE.
 
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
@@ -172,7 +169,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
diff --git a/solvers/elliptic/okl/ellipticRhsBCIpdgTri2D.okl b/solvers/elliptic/okl/ellipticRhsBCIpdgTri2D.okl
index 16f82a8fe..68c3ae227 100644
--- a/solvers/elliptic/okl/ellipticRhsBCIpdgTri2D.okl
+++ b/solvers/elliptic/okl/ellipticRhsBCIpdgTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -85,7 +85,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dqdx += LIFT*(sJ/J)*nx*dq
     for(int n=0;n<p_Nmax;++n;@inner(0)){
@@ -116,7 +115,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -138,7 +136,6 @@ SOFTWARE.
 
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
@@ -154,7 +151,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
diff --git a/solvers/elliptic/okl/ellipticRhsBCQuad2D.okl b/solvers/elliptic/okl/ellipticRhsBCQuad2D.okl
index c85c54727..a80c7e7c1 100644
--- a/solvers/elliptic/okl/ellipticRhsBCQuad2D.okl
+++ b/solvers/elliptic/okl/ellipticRhsBCQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,6 +25,7 @@ SOFTWARE.
 */
 
 @kernel void ellipticRhsBCQuad2D(const dlong Nelements,
+                              @restrict const  dfloat *  wJ,
                               @restrict const  dfloat *  ggeo,
                               @restrict const  dfloat *  sgeo,
                               @restrict const  dfloat *  DT,
@@ -57,7 +58,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 2
     for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -99,7 +99,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -141,15 +140,13 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
         const dlong base = e*p_Nggeo*p_Np + j*p_Nq + i;
 
-        // assumes w*J built into G entries
-        r_GwJ[j] = ggeo[base+p_GWJID*p_Np];
+        r_GwJ[j] = wJ[e*p_Np + j*p_Nq + i];
 
         r_G00[j] = ggeo[base+p_G00ID*p_Np];
         r_G01[j] = ggeo[base+p_G01ID*p_Np];
@@ -171,7 +168,6 @@ SOFTWARE.
     }
 
     // r term ----->
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -179,7 +175,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -194,7 +189,6 @@ SOFTWARE.
     }
 
     // s term ---->
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -202,7 +196,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticRhsBCQuad3D.okl b/solvers/elliptic/okl/ellipticRhsBCQuad3D.okl
index a1c64ce56..3df529584 100644
--- a/solvers/elliptic/okl/ellipticRhsBCQuad3D.okl
+++ b/solvers/elliptic/okl/ellipticRhsBCQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@ SOFTWARE.
 
 // this is incomplete, needs to be fixed up for bcs in 3D
 @kernel void ellipticRhsBCQuad3D(const dlong Nelements,
+                              @restrict const  dfloat *  wJ,
                               @restrict const  dfloat *  ggeo,
                               @restrict const  dfloat *  sgeo,
                               @restrict const  dfloat *  DT,
@@ -58,7 +59,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 2
     for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -68,7 +68,7 @@ SOFTWARE.
 
         const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];
         const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];
-	const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];
+        const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];
         const dfloat WsJ = sgeo[sk*p_Nsgeo+p_WSJID];
 
         dfloat dudxP=0, dudyP=0, dudzP=0, uP=0;
@@ -87,7 +87,7 @@ SOFTWARE.
 
         const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];
         const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];
-	const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];
+        const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];
         const dfloat WsJ = sgeo[sk*p_Nsgeo+p_WSJID];
 
         dfloat dudxP=0, dudyP=0, dudzP=0, uP=0;
@@ -102,7 +102,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -112,7 +111,7 @@ SOFTWARE.
 
         const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];
         const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];
-	const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];
+        const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];
         const dfloat WsJ = sgeo[sk*p_Nsgeo+p_WSJID];
 
         dfloat dudxP=0, dudyP=0, dudzP=0, uP=0;
@@ -131,7 +130,7 @@ SOFTWARE.
 
         const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];
         const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];
-	const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];
+        const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];
         const dfloat WsJ = sgeo[sk*p_Nsgeo+p_WSJID];
 
         dfloat dudxP=0, dudyP=0, dudzP=0, uP=0;
@@ -146,17 +145,15 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // INCOMPLETENESS STARTS HERE
-    
+
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
         const dlong base = e*p_Nggeo*p_Np + j*p_Nq + i;
 
-        // assumes w*J built into G entries
-        r_GwJ[j] = ggeo[base+p_GWJID*p_Np];
+        r_GwJ[j] = wJ[e*p_Np + j*p_Nq + i];
 
         r_G00[j] = ggeo[base+p_G00ID*p_Np];
         r_G01[j] = ggeo[base+p_G01ID*p_Np];
@@ -178,7 +175,6 @@ SOFTWARE.
     }
 
     // r term ----->
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -186,7 +182,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -201,7 +196,6 @@ SOFTWARE.
     }
 
     // s term ---->
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -209,7 +203,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticRhsBCTet3D.okl b/solvers/elliptic/okl/ellipticRhsBCTet3D.okl
index 08ad8f508..e63dd9b9f 100644
--- a/solvers/elliptic/okl/ellipticRhsBCTet3D.okl
+++ b/solvers/elliptic/okl/ellipticRhsBCTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,6 +25,7 @@ SOFTWARE.
 */
 
 @kernel void ellipticRhsBCTet3D(const int Nelements,
+                              @restrict const  dfloat *  wJ,
                               @restrict const  dfloat *  ggeo,
                               @restrict const  dfloat *  sgeo,
                               @restrict const  dfloat *  D,
@@ -52,7 +53,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -83,7 +83,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
@@ -96,7 +95,7 @@ SOFTWARE.
         const dfloat Gss = ggeo[gid + p_G11ID];
         const dfloat Gst = ggeo[gid + p_G12ID];
         const dfloat Gtt = ggeo[gid + p_G22ID];
-        const dfloat J   = ggeo[gid + p_GWJID];
+        const dfloat J   = wJ[e];
 
         dfloat qrr = 0.;
         dfloat qrs = 0.;
diff --git a/solvers/elliptic/okl/ellipticRhsBCTri2D.okl b/solvers/elliptic/okl/ellipticRhsBCTri2D.okl
index 9158d6898..9e128ec84 100644
--- a/solvers/elliptic/okl/ellipticRhsBCTri2D.okl
+++ b/solvers/elliptic/okl/ellipticRhsBCTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,6 +25,7 @@ SOFTWARE.
 */
 
 @kernel void ellipticRhsBCTri2D(const dlong Nelements,
+                              @restrict const  dfloat *  wJ,
                               @restrict const  dfloat *  ggeo,
                               @restrict const  dfloat *  sgeo,
                               @restrict const  dfloat *  D,
@@ -52,7 +53,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -82,7 +82,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Nmax;++n;@inner(0)){
       if(n<p_Np){
@@ -92,7 +91,7 @@ SOFTWARE.
         const dfloat Grr = ggeo[gid + p_G00ID];
         const dfloat Grs = ggeo[gid + p_G01ID];
         const dfloat Gss = ggeo[gid + p_G11ID];
-        const dfloat J   = ggeo[gid + p_GWJID];
+        const dfloat J   = wJ[e];
 
         dfloat qrr = 0.;
         dfloat qrs = 0.;
diff --git a/solvers/elliptic/okl/ellipticRhsHex3D.okl b/solvers/elliptic/okl/ellipticRhsHex3D.okl
index 97a26af61..3529315a6 100644
--- a/solvers/elliptic/okl/ellipticRhsHex3D.okl
+++ b/solvers/elliptic/okl/ellipticRhsHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +26,7 @@ SOFTWARE.
 
 //spectral mass matrix
 @kernel void ellipticRhsHex3D(const dlong Nelements,
-                               @restrict const dfloat* ggeo,
+                               @restrict const dfloat* wJ,
                                @restrict const dfloat* MM,
                                @restrict const dfloat* x,
                                @restrict const dfloat* y,
@@ -39,9 +39,8 @@ SOFTWARE.
     for(int n=0;n<p_Np;++n;@inner(0)){
 
       // assumes w*J built into G entries
-      const dlong gbase = e*p_Nggeo*p_Np + n;
-      const dfloat r_GwJ = ggeo[gbase+p_GWJID*p_Np];
       const dlong id = e*p_Np + n;
+      const dfloat r_GwJ = wJ[id];
 
       dfloat f = 0;
       ellipticForcing3D(x[id], y[id], z[id], lambda, f);
diff --git a/solvers/elliptic/okl/ellipticRhsQuad2D.okl b/solvers/elliptic/okl/ellipticRhsQuad2D.okl
index 3fcde8791..4ed8f2077 100644
--- a/solvers/elliptic/okl/ellipticRhsQuad2D.okl
+++ b/solvers/elliptic/okl/ellipticRhsQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +26,7 @@ SOFTWARE.
 
 //spectral mass matrix
 @kernel void ellipticRhsQuad2D(const dlong Nelements,
-                               @restrict const dfloat* ggeo,
+                               @restrict const dfloat* wJ,
                                @restrict const dfloat* MM,
                                @restrict const dfloat* x,
                                @restrict const dfloat* y,
@@ -39,9 +39,8 @@ SOFTWARE.
     for(int n=0;n<p_Np;++n;@inner(0)){
 
       // assumes w*J built into G entries
-      const dlong gbase = e*p_Nggeo*p_Np + n;
-      const dfloat r_GwJ = ggeo[gbase+p_GWJID*p_Np];
       const dlong id = e*p_Np + n;
+      const dfloat r_GwJ = wJ[id];
 
       dfloat f = 0;
       ellipticForcing2D(x[id], y[id], lambda, f);
diff --git a/solvers/elliptic/okl/ellipticRhsQuad3D.okl b/solvers/elliptic/okl/ellipticRhsQuad3D.okl
index 4f6436524..49243f647 100644
--- a/solvers/elliptic/okl/ellipticRhsQuad3D.okl
+++ b/solvers/elliptic/okl/ellipticRhsQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +26,7 @@ SOFTWARE.
 
 //spectral mass matrix
 @kernel void ellipticRhsQuad3D(const dlong Nelements,
-                               @restrict const dfloat* ggeo,
+                               @restrict const dfloat* wJ,
                                @restrict const dfloat* MM,
                                @restrict const dfloat* x,
                                @restrict const dfloat* y,
@@ -39,9 +39,8 @@ SOFTWARE.
     for(int n=0;n<p_Np;++n;@inner(0)){
 
       // assumes w*J built into G entries
-      const dlong gbase = e*p_Nggeo*p_Np + n;
-      const dfloat r_GwJ = ggeo[gbase+p_GWJID*p_Np];
       const dlong id = e*p_Np + n;
+      const dfloat r_GwJ = wJ[id];
 
       dfloat f = 0;
       ellipticForcing3D(x[id], y[id], z[id], lambda, f);
diff --git a/solvers/elliptic/okl/ellipticRhsTet3D.okl b/solvers/elliptic/okl/ellipticRhsTet3D.okl
index 849a3645d..f77547eb8 100644
--- a/solvers/elliptic/okl/ellipticRhsTet3D.okl
+++ b/solvers/elliptic/okl/ellipticRhsTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +26,7 @@ SOFTWARE.
 
 
 @kernel void ellipticRhsTet3D(const dlong Nelements,
-                              @restrict const dfloat* ggeo,
+                              @restrict const dfloat* wJ,
                               @restrict const dfloat* MM,
                               @restrict const dfloat* x,
                               @restrict const dfloat* y,
@@ -47,11 +47,9 @@ SOFTWARE.
       s_q[n] = f;
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){
-      const dlong gid = e*p_Nggeo;
-      const dfloat J = ggeo[gid + p_GWJID];
+      const dfloat J = wJ[e];
 
       dfloat r_qM = 0.0;
 
diff --git a/solvers/elliptic/okl/ellipticRhsTri2D.okl b/solvers/elliptic/okl/ellipticRhsTri2D.okl
index 3cc4bbd7a..d5d89a253 100644
--- a/solvers/elliptic/okl/ellipticRhsTri2D.okl
+++ b/solvers/elliptic/okl/ellipticRhsTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +26,7 @@ SOFTWARE.
 
 
 @kernel void ellipticRhsTri2D(const dlong Nelements,
-                              @restrict const dfloat* ggeo,
+                              @restrict const dfloat* wJ,
                               @restrict const dfloat* MM,
                               @restrict const dfloat* x,
                               @restrict const dfloat* y,
@@ -47,11 +47,9 @@ SOFTWARE.
       s_q[n] = f;
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_Np;++n;@inner(0)){
-      const dlong gid = e*p_Nggeo;
-      const dfloat J = ggeo[gid + p_GWJID];
+      const dfloat J = wJ[e];
 
       dfloat r_qM = 0.0;
 
diff --git a/solvers/elliptic/okl/ellipticSEMFEMAnterp.okl b/solvers/elliptic/okl/ellipticSEMFEMAnterp.okl
index 761b5a996..cb15c918e 100644
--- a/solvers/elliptic/okl/ellipticSEMFEMAnterp.okl
+++ b/solvers/elliptic/okl/ellipticSEMFEMAnterp.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -45,7 +45,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
diff --git a/solvers/elliptic/okl/ellipticSEMFEMInterp.okl b/solvers/elliptic/okl/ellipticSEMFEMInterp.okl
index 1caa17d76..9b008be08 100644
--- a/solvers/elliptic/okl/ellipticSEMFEMInterp.okl
+++ b/solvers/elliptic/okl/ellipticSEMFEMInterp.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -45,7 +45,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_NpFEM;++n;@inner(0)){
diff --git a/solvers/elliptic/src/ellipticBoundarySetup.cpp b/solvers/elliptic/src/ellipticBoundarySetup.cpp
index 52c96e43b..6d77a330a 100644
--- a/solvers/elliptic/src/ellipticBoundarySetup.cpp
+++ b/solvers/elliptic/src/ellipticBoundarySetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,126 +25,116 @@ SOFTWARE.
 */
 
 #include "elliptic.hpp"
+#include <limits>
 
 void elliptic_t::BoundarySetup(){
 
   //check all the bounaries for a Dirichlet
-  int localAllNeumann = (lambda==0) ? 1 : 0; //if lambda>0 we don't care about all Neumann problem
+  allNeumann = (lambda==0) ? 1 : 0; //if lambda>0 we don't care about all Neumann problem
   allNeumannPenalty = 1.;
 
-  //setup normalization constant
-  if (settings.compareSetting("DISCRETIZATION","IPDG")) {
-    allNeumannScale = 1./sqrt((dfloat)mesh.Np*mesh.NelementsGlobal);
-  } else {
-    //note that we can use the mesh ogs, since there are no masked nodes
-    allNeumannScale = 1./sqrt((dfloat)mesh.ogs->NgatherGlobal);
-  }
-
-  //setup a custom element-to-boundaryflag mapping
-  EToB = (int *) calloc(mesh.Nelements*mesh.Nfaces,sizeof(int));
+  //translate the mesh's element-to-boundaryflag mapping
+  EToB.malloc(mesh.Nelements*mesh.Nfaces, 0);
   for (dlong e=0;e<mesh.Nelements;e++) {
     for (int f=0;f<mesh.Nfaces;f++) {
       int bc = mesh.EToB[e*mesh.Nfaces+f];
       if (bc>0) {
-        int BC = BCType[bc];  //translate mesh's boundary flag
-        EToB[e*mesh.Nfaces+f] = BC;    //record it
-        if (BC!=2) localAllNeumann = 0;     //check if its a Dirchlet
+        int BC = BCType[bc];         //translate mesh's boundary flag
+        EToB[e*mesh.Nfaces+f] = BC;  //record it
+        if (BC!=2) allNeumann = 0;   //any non-Neumann (e.g. Dirichlet) face clears the flag
       }
     }
   }
-  o_EToB = platform.malloc(mesh.Nelements*mesh.Nfaces*sizeof(int), EToB);
+  o_EToB = platform.malloc<int>(EToB);
 
   //collect the allNeumann flags from other ranks
-  MPI_Allreduce(&localAllNeumann, &allNeumann, 1, MPI_INT, MPI_MIN, mesh.comm);
-
+  mesh.comm.Allreduce(allNeumann, Comm::Min);
 
-  //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann)
-  mapB = (int *) calloc(mesh.Nelements*mesh.Np,sizeof(int));
-  const int largeNumber = 1<<20;
-  for (dlong e=0;e<mesh.Nelements;e++) {
-    for (int n=0;n<mesh.Np;n++) mapB[n+e*mesh.Np] = largeNumber;
-    for (int f=0;f<mesh.Nfaces;f++) {
-      int bc = EToB[f+e*mesh.Nfaces];
-      if (bc>0) {
-        for (int n=0;n<mesh.Nfp;n++) {
-          int fid = mesh.faceNodes[n+f*mesh.Nfp];
-          mapB[fid+e*mesh.Np] = mymin(bc,mapB[fid+e*mesh.Np]);
-        }
-      }
-    }
-  }
-  mesh.ogs->GatherScatter(mapB, ogs_int, ogs_min, ogs_sym);
-
-  //use the bc flags to find masked ids
+  //translate the mesh's node-wise bc flag
   Nmasked = 0;
-  for (dlong n=0;n<mesh.Nelements*mesh.Np;n++) {
-    if (mapB[n] == largeNumber) {//no boundary
-      mapB[n] = 0.;
-    } else if (mapB[n] == 1) {   //Dirichlet boundary
-      Nmasked++;
+  mapB.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np, 0);
+  for (dlong n=0;n<mesh.Nelements*mesh.Np;n++) {
+    int bc = mesh.mapB[n];
+    if (bc>0) {
+      int BC = BCType[bc];     //translate mesh's boundary flag
+      mapB[n] = BC;  //record it
+
+      if (mapB[n] == 1) Nmasked++;   //Dirichlet boundary
     }
   }
-  o_mapB = platform.malloc(mesh.Nelements*mesh.Np*sizeof(int), mapB);
+  o_mapB = platform.malloc<int>(mapB);
 
-
-  maskIds = (dlong *) calloc(Nmasked, sizeof(dlong));
+  maskIds.malloc(Nmasked);
   Nmasked =0; //reset
-  for (dlong n=0;n<mesh.Nelements*mesh.Np;n++)
+  for (dlong n=0;n<mesh.Nelements*mesh.Np;n++) {
     if (mapB[n] == 1) maskIds[Nmasked++] = n;
-
-  if (Nmasked) o_maskIds = platform.malloc(Nmasked*sizeof(dlong), maskIds);
+  }
+  o_maskIds = platform.malloc<dlong>(maskIds);
 
   //make a masked version of the global id numbering
-  maskedGlobalIds = (hlong *) calloc(mesh.Nelements*mesh.Np,sizeof(hlong));
-  memcpy(maskedGlobalIds, mesh.globalIds, mesh.Nelements*mesh.Np*sizeof(hlong));
-  for (dlong n=0;n<Nmasked;n++)
+  maskedGlobalIds.malloc(mesh.Nelements*mesh.Np);
+  maskedGlobalIds.copyFrom(mesh.globalIds);
+  for (dlong n=0;n<Nmasked;n++) {
     maskedGlobalIds[maskIds[n]] = 0;
+  }
 
   //use the masked ids to make another gs handle (signed so the gather is defined)
-  int verbose = 0;
-  ogs_t::Unique(maskedGlobalIds, mesh.Nelements*mesh.Np, mesh.comm);     //flag a unique node in every gather node
-  ogsMasked = ogs_t::Setup(mesh.Nelements*mesh.Np, maskedGlobalIds,
-                           mesh.comm, verbose, platform);
+  bool verbose = settings.compareSetting("VERBOSE", "TRUE") ? true : false;
+  bool unique = true; //flag a unique node in every gather node
+  ogsMasked.Setup(mesh.Nelements*mesh.Np, maskedGlobalIds,
+                  mesh.comm, ogs::Signed, ogs::Auto,
+                  unique, verbose, platform);
+
+  //setup normalization constant
+  if (settings.compareSetting("DISCRETIZATION","IPDG")) {
+    allNeumannScale = 1./sqrt((dfloat)mesh.Np*mesh.NelementsGlobal);
+  } else {
+    //scale by the global number of gathered nodes in the masked ogs
+    allNeumannScale = 1./sqrt((dfloat)ogsMasked.NgatherGlobal);
+  }
 
   /* use the masked gs handle to define a global ordering */
   dlong Ntotal  = mesh.Np*mesh.Nelements; // number of degrees of freedom on this rank (before gathering)
-  hlong Ngather = ogsMasked->Ngather;     // number of degrees of freedom on this rank (after gathering)
+  hlong Ngather = ogsMasked.Ngather;     // number of degrees of freedom on this rank (after gathering)
 
   // build inverse degree vectors
   // used for the weight in linear solvers (used in C0)
-  weight  = (dfloat*) calloc(Ntotal, sizeof(dfloat));
-  weightG = (dfloat*) calloc(ogsMasked->Ngather, sizeof(dfloat));
-  for(dlong n=0;n<Ntotal;++n) weight[n] = 1.0;
+  weight.malloc(Ntotal, 1.0);
 
-  ogsMasked->Gather(weightG, weight, ogs_dfloat, ogs_add, ogs_trans);
-  for(dlong n=0;n<ogsMasked->Ngather;++n)
-    if (weightG[n]) weightG[n] = 1./weightG[n];
+  weightG.malloc(Ngather);
+  ogsMasked.Gather(weightG, weight, 1, ogs::Add, ogs::Trans);
 
-  ogsMasked->Scatter(weight, weightG, ogs_dfloat, ogs_add, ogs_notrans);
+  for(dlong n=0;n<ogsMasked.Ngather;++n) {
+    if (weightG[n]>0.0) weightG[n] = 1./weightG[n];
+  }
+
+  ogsMasked.Scatter(weight, weightG, 1, ogs::NoTrans);
 
-  o_weight  = platform.malloc(Ntotal*sizeof(dfloat), weight);
-  o_weightG = platform.malloc(ogsMasked->Ngather*sizeof(dfloat), weightG);
+  o_weight  = platform.malloc<dfloat>(weight);
+  o_weightG = platform.malloc<dfloat>(weightG);
 
   // create a global numbering system
-  hlong *globalIds = (hlong *) calloc(Ngather,sizeof(hlong));
+  memory<hlong> globalIds(Ngather);
 
   // every gathered degree of freedom has its own global id
-  hlong *globalStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh.comm);
-  for(int r=0;r<mesh.size;++r)
-    globalStarts[r+1] = globalStarts[r] + globalStarts[r+1];
+  hlong globalOffset=static_cast<hlong>(Ngather);
+  comm.Scan(Ngather, globalOffset);
+  globalOffset = globalOffset-Ngather;
 
   //use the offsets to set a consecutive global numbering
-  for (dlong n =0;n<ogsMasked->Ngather;n++) {
-    globalIds[n] = n + globalStarts[mesh.rank];
+  for (dlong n =0;n<ogsMasked.Ngather;n++) {
+    globalIds[n] = n + globalOffset;
   }
 
   //scatter this numbering to the original nodes
-  maskedGlobalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong));
-  for (dlong n=0;n<Ntotal;n++) maskedGlobalNumbering[n] = -1;
-  ogsMasked->Scatter(maskedGlobalNumbering, globalIds, ogs_hlong, ogs_add, ogs_notrans);
-  free(globalIds);
+  maskedGlobalNumbering.malloc(Ntotal, -1);
+  ogsMasked.Scatter(maskedGlobalNumbering, globalIds, 1, ogs::NoTrans);
 
   /* Build halo exchange for gathered ordering */
-  ogsMasked->GatheredHaloExchangeSetup();
+  gHalo.SetupFromGather(ogsMasked);
+
+  GlobalToLocal.malloc(mesh.Nelements*mesh.Np);
+  ogsMasked.SetupGlobalToLocalMapping(GlobalToLocal);
+
+  o_GlobalToLocal = platform.malloc<dlong>(GlobalToLocal);
 }
diff --git a/solvers/elliptic/src/ellipticBuildOperatorDiagonal.cpp b/solvers/elliptic/src/ellipticBuildOperatorDiagonal.cpp
index 23669611c..a984b8546 100644
--- a/solvers/elliptic/src/ellipticBuildOperatorDiagonal.cpp
+++ b/solvers/elliptic/src/ellipticBuildOperatorDiagonal.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,16 +25,14 @@ SOFTWARE.
 */
 
 #include "elliptic.hpp"
-#include "mesh/meshDefines2D.h"
-#include "mesh/meshDefines3D.h"
 
-void elliptic_t::BuildOperatorDiagonal(dfloat *diagA){
+void elliptic_t::BuildOperatorDiagonal(memory<dfloat>& diagA){
 
-  if(mesh.rank==0) {printf("Building diagonal...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building diagonal...");fflush(stdout);}
 
   if (settings.compareSetting("DISCRETIZATION","IPDG")) {
     switch(mesh.elementType){
-      case TRIANGLES:
+      case Mesh::TRIANGLES:
       {
         if(mesh.dim==2)
           BuildOperatorDiagonalIpdgTri2D(diagA);
@@ -42,24 +40,24 @@ void elliptic_t::BuildOperatorDiagonal(dfloat *diagA){
           BuildOperatorDiagonalIpdgTri3D(diagA);
         break;
       }
-      case QUADRILATERALS:
+      case Mesh::QUADRILATERALS:
         BuildOperatorDiagonalIpdgQuad2D(diagA);
         break;
-      case TETRAHEDRA:
+      case Mesh::TETRAHEDRA:
         BuildOperatorDiagonalIpdgTet3D(diagA);
         break;
-      case HEXAHEDRA:
+      case Mesh::HEXAHEDRA:
         BuildOperatorDiagonalIpdgHex3D(diagA);
         break;
     }
   } else if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) {
-    dfloat* diagAL = (dfloat*) malloc(mesh.Np*mesh.Nelements*sizeof(dfloat));
+    memory<dfloat> diagAL(mesh.Np*mesh.Nelements);
 
     switch(mesh.elementType){
-      case TRIANGLES:
+      case Mesh::TRIANGLES:
         BuildOperatorDiagonalContinuousTri2D(diagAL);
         break;
-      case QUADRILATERALS:
+      case Mesh::QUADRILATERALS:
       {
         if(mesh.dim==2)
           BuildOperatorDiagonalContinuousQuad2D(diagAL);
@@ -67,25 +65,24 @@ void elliptic_t::BuildOperatorDiagonal(dfloat *diagA){
           BuildOperatorDiagonalContinuousQuad3D(diagAL);
         break;
       }
-      case TETRAHEDRA:
+      case Mesh::TETRAHEDRA:
         BuildOperatorDiagonalContinuousTet3D(diagAL);
         break;
-      case HEXAHEDRA:
+      case Mesh::HEXAHEDRA:
         BuildOperatorDiagonalContinuousHex3D(diagAL);
         break;
     }
 
     //gather the diagonal to assemble it
-    ogsMasked->Gather(diagA, diagAL, ogs_dfloat, ogs_add, ogs_trans);
-    free(diagAL);
+    ogsMasked.Gather(diagA, diagAL, 1, ogs::Add, ogs::Trans);
   }
-  if(mesh.rank==0) printf("done.\n");
+  if(Comm::World().rank()==0) printf("done.\n");
 }
 
-void elliptic_t::BuildOperatorDiagonalIpdgTri2D(dfloat *A) {
+void elliptic_t::BuildOperatorDiagonalIpdgTri2D(memory<dfloat>& A) {
 
   // surface mass matrices MS = MM*LIFT
-  dfloat *MS = (dfloat *) calloc(mesh.Nfaces*mesh.Nfp*mesh.Nfp,sizeof(dfloat));
+  memory<dfloat> MS(mesh.Nfaces*mesh.Nfp*mesh.Nfp);
   for (int f=0;f<mesh.Nfaces;f++) {
     for (int n=0;n<mesh.Nfp;n++) {
       int fn = mesh.faceNodes[f*mesh.Nfp+n];
@@ -104,11 +101,11 @@ void elliptic_t::BuildOperatorDiagonalIpdgTri2D(dfloat *A) {
 
   for(dlong eM=0;eM<mesh.Nelements;++eM){
     dlong vbase = eM*mesh.Nvgeo;
-    dfloat drdx = mesh.vgeo[vbase+RXID];
-    dfloat drdy = mesh.vgeo[vbase+RYID];
-    dfloat dsdx = mesh.vgeo[vbase+SXID];
-    dfloat dsdy = mesh.vgeo[vbase+SYID];
-    dfloat J = mesh.vgeo[vbase+JID];
+    dfloat drdx = mesh.vgeo[vbase+mesh.RXID];
+    dfloat drdy = mesh.vgeo[vbase+mesh.RYID];
+    dfloat dsdx = mesh.vgeo[vbase+mesh.SXID];
+    dfloat dsdy = mesh.vgeo[vbase+mesh.SYID];
+    dfloat J = mesh.wJ[eM];
 
     /* start with stiffness matrix  */
     for(int n=0;n<mesh.Np;++n){
@@ -131,10 +128,10 @@ void elliptic_t::BuildOperatorDiagonalIpdgTri2D(dfloat *A) {
     for (int fM=0;fM<mesh.Nfaces;fM++) {
       // load surface geofactors for this face
       dlong sid = mesh.Nsgeo*(eM*mesh.Nfaces+fM);
-      dfloat nx = mesh.sgeo[sid+NXID];
-      dfloat ny = mesh.sgeo[sid+NYID];
-      dfloat sJ = mesh.sgeo[sid+SJID];
-      dfloat hinv = mesh.sgeo[sid+IHID];
+      dfloat nx = mesh.sgeo[sid+mesh.NXID];
+      dfloat ny = mesh.sgeo[sid+mesh.NYID];
+      dfloat sJ = mesh.sgeo[sid+mesh.SJID];
+      dfloat hinv = mesh.sgeo[sid+mesh.IHID];
 
       int bc = mesh.EToB[fM+mesh.Nfaces*eM]; //raw boundary flag
 
@@ -155,7 +152,7 @@ void elliptic_t::BuildOperatorDiagonalIpdgTri2D(dfloat *A) {
       }
 
       // mass matrix for this face
-      dfloat *MSf = MS+fM*mesh.Nfp*mesh.Nfp;
+      memory<dfloat> MSf = MS+fM*mesh.Nfp*mesh.Nfp;
 
       // penalty term just involves face nodes
       for(int n=0;n<mesh.Nfp;++n){
@@ -211,14 +208,12 @@ void elliptic_t::BuildOperatorDiagonalIpdgTri2D(dfloat *A) {
       }
     }
   }
-
-  free(MS);
 }
 
-void elliptic_t::BuildOperatorDiagonalIpdgTri3D(dfloat *A) {
+void elliptic_t::BuildOperatorDiagonalIpdgTri3D(memory<dfloat>& A) {
 
   // surface mass matrices MS = MM*LIFT
-  dfloat *MS = (dfloat *) calloc(mesh.Nfaces*mesh.Nfp*mesh.Nfp,sizeof(dfloat));
+  memory<dfloat> MS(mesh.Nfaces*mesh.Nfp*mesh.Nfp);
   for (int f=0;f<mesh.Nfaces;f++) {
     for (int n=0;n<mesh.Nfp;n++) {
       int fn = mesh.faceNodes[f*mesh.Nfp+n];
@@ -237,13 +232,13 @@ void elliptic_t::BuildOperatorDiagonalIpdgTri3D(dfloat *A) {
 
   for(dlong eM=0;eM<mesh.Nelements;++eM){
     dlong vbase = eM*mesh.Nvgeo;
-    dfloat drdx = mesh.vgeo[vbase+RXID];
-    dfloat drdy = mesh.vgeo[vbase+RYID];
-    dfloat drdz = mesh.vgeo[vbase+RZID];
-    dfloat dsdx = mesh.vgeo[vbase+SXID];
-    dfloat dsdy = mesh.vgeo[vbase+SYID];
-    dfloat dsdz = mesh.vgeo[vbase+SZID];
-    dfloat J = mesh.vgeo[vbase+JID];
+    dfloat drdx = mesh.vgeo[vbase+mesh.RXID];
+    dfloat drdy = mesh.vgeo[vbase+mesh.RYID];
+    dfloat drdz = mesh.vgeo[vbase+mesh.RZID];
+    dfloat dsdx = mesh.vgeo[vbase+mesh.SXID];
+    dfloat dsdy = mesh.vgeo[vbase+mesh.SYID];
+    dfloat dsdz = mesh.vgeo[vbase+mesh.SZID];
+    dfloat J = mesh.wJ[eM];
 
     /* start with stiffness matrix  */
     for(int n=0;n<mesh.Np;++n){
@@ -269,11 +264,11 @@ void elliptic_t::BuildOperatorDiagonalIpdgTri3D(dfloat *A) {
     for (int fM=0;fM<mesh.Nfaces;fM++) {
       // load surface geofactors for this face
       dlong sid = mesh.Nsgeo*(eM*mesh.Nfaces+fM);
-      dfloat nx = mesh.sgeo[sid+NXID];
-      dfloat ny = mesh.sgeo[sid+NYID];
-      dfloat nz = mesh.sgeo[sid+NZID];
-      dfloat sJ = mesh.sgeo[sid+SJID];
-      dfloat hinv = mesh.sgeo[sid+IHID];
+      dfloat nx = mesh.sgeo[sid+mesh.NXID];
+      dfloat ny = mesh.sgeo[sid+mesh.NYID];
+      dfloat nz = mesh.sgeo[sid+mesh.NZID];
+      dfloat sJ = mesh.sgeo[sid+mesh.SJID];
+      dfloat hinv = mesh.sgeo[sid+mesh.IHID];
 
       int bc = mesh.EToB[fM+mesh.Nfaces*eM]; //raw boundary flag
 
@@ -294,7 +289,7 @@ void elliptic_t::BuildOperatorDiagonalIpdgTri3D(dfloat *A) {
       }
 
       // mass matrix for this face
-      dfloat *MSf = MS+fM*mesh.Nfp*mesh.Nfp;
+      memory<dfloat> MSf = MS+fM*mesh.Nfp*mesh.Nfp;
 
       // penalty term just involves face nodes
       for(int n=0;n<mesh.Nfp;++n){
@@ -354,18 +349,16 @@ void elliptic_t::BuildOperatorDiagonalIpdgTri3D(dfloat *A) {
       }
     }
   }
-
-  free(MS);
 }
 
-void elliptic_t::BuildOperatorDiagonalContinuousTri2D(dfloat *A) {
+void elliptic_t::BuildOperatorDiagonalContinuousTri2D(memory<dfloat>& A) {
 
   for(dlong eM=0;eM<mesh.Nelements;++eM){
     dlong gbase = eM*mesh.Nggeo;
-    dfloat Grr = mesh.ggeo[gbase + G00ID];
-    dfloat Grs = mesh.ggeo[gbase + G01ID];
-    dfloat Gss = mesh.ggeo[gbase + G11ID];
-    dfloat J   = mesh.ggeo[gbase + GWJID];
+    dfloat Grr = mesh.ggeo[gbase + mesh.G00ID];
+    dfloat Grs = mesh.ggeo[gbase + mesh.G01ID];
+    dfloat Gss = mesh.ggeo[gbase + mesh.G11ID];
+    dfloat J   = mesh.wJ[eM];
 
     /* start with stiffness matrix  */
     for(int n=0;n<mesh.Np;++n){
@@ -390,12 +383,12 @@ void elliptic_t::BuildOperatorDiagonalContinuousTri2D(dfloat *A) {
   }
 }
 
-void elliptic_t::BuildOperatorDiagonalIpdgQuad2D(dfloat *A) {
+void elliptic_t::BuildOperatorDiagonalIpdgQuad2D(memory<dfloat>& A) {
 
   // build some monolithic basis arrays (for quads and hexes)
-  dfloat *B  = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
+  memory<dfloat> B (mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Br(mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Bs(mesh.Np*mesh.Np, 0.0);
 
   int mode = 0;
   for(int nj=0;nj<mesh.N+1;++nj){
@@ -428,11 +421,11 @@ void elliptic_t::BuildOperatorDiagonalIpdgQuad2D(dfloat *A) {
       // (grad phi_n, grad phi_m)_{D^e}
       for(int i=0;i<mesh.Np;++i){
         dlong base = eM*mesh.Np*mesh.Nvgeo + i;
-        dfloat drdx = mesh.vgeo[base+mesh.Np*RXID];
-        dfloat drdy = mesh.vgeo[base+mesh.Np*RYID];
-        dfloat dsdx = mesh.vgeo[base+mesh.Np*SXID];
-        dfloat dsdy = mesh.vgeo[base+mesh.Np*SYID];
-        dfloat JW   = mesh.vgeo[base+mesh.Np*JWID];
+        dfloat drdx = mesh.vgeo[base+mesh.Np*mesh.RXID];
+        dfloat drdy = mesh.vgeo[base+mesh.Np*mesh.RYID];
+        dfloat dsdx = mesh.vgeo[base+mesh.Np*mesh.SXID];
+        dfloat dsdy = mesh.vgeo[base+mesh.Np*mesh.SYID];
+        dfloat JW   = mesh.wJ[eM*mesh.Np + i];
 
         int idn = n*mesh.Np+i;
         dfloat dlndx = drdx*Br[idn] + dsdx*Bs[idn];
@@ -448,17 +441,17 @@ void elliptic_t::BuildOperatorDiagonalIpdgQuad2D(dfloat *A) {
 
           // grab vol geofacs at surface nodes
           dlong baseM = eM*mesh.Np*mesh.Nvgeo + vidM;
-          dfloat drdxM = mesh.vgeo[baseM+mesh.Np*RXID];
-          dfloat drdyM = mesh.vgeo[baseM+mesh.Np*RYID];
-          dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*SXID];
-          dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*SYID];
+          dfloat drdxM = mesh.vgeo[baseM+mesh.Np*mesh.RXID];
+          dfloat drdyM = mesh.vgeo[baseM+mesh.Np*mesh.RYID];
+          dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*mesh.SXID];
+          dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*mesh.SYID];
 
           // grab surface geometric factors
           dlong base = mesh.Nsgeo*(eM*mesh.Nfp*mesh.Nfaces + fM*mesh.Nfp + i);
-          dfloat nx = mesh.sgeo[base+NXID];
-          dfloat ny = mesh.sgeo[base+NYID];
-          dfloat wsJ = mesh.sgeo[base+WSJID];
-          dfloat hinv = mesh.sgeo[base+IHID];
+          dfloat nx = mesh.sgeo[base+mesh.NXID];
+          dfloat ny = mesh.sgeo[base+mesh.NYID];
+          dfloat wsJ = mesh.sgeo[base+mesh.WSJID];
+          dfloat hinv = mesh.sgeo[base+mesh.IHID];
 
           // form negative trace terms in IPDG
           int idnM = n*mesh.Np+vidM;
@@ -492,11 +485,9 @@ void elliptic_t::BuildOperatorDiagonalIpdgQuad2D(dfloat *A) {
       }
     }
   }
-
-  free(B); free(Br); free(Bs);
 }
 
-void elliptic_t::BuildOperatorDiagonalContinuousQuad2D(dfloat *A) {
+void elliptic_t::BuildOperatorDiagonalContinuousQuad2D(memory<dfloat>& A) {
 
   for(dlong eM=0;eM<mesh.Nelements;++eM){
     for (int ny=0;ny<mesh.Nq;ny++) {
@@ -507,21 +498,21 @@ void elliptic_t::BuildOperatorDiagonalContinuousQuad2D(dfloat *A) {
 
           for (int k=0;k<mesh.Nq;k++) {
             int id = k+ny*mesh.Nq;
-            dfloat Grr = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + G00ID*mesh.Np];
+            dfloat Grr = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + mesh.G00ID*mesh.Np];
             A[eM*mesh.Np+iid] += Grr*mesh.D[nx+k*mesh.Nq]*mesh.D[nx+k*mesh.Nq];
           }
 
           for (int k=0;k<mesh.Nq;k++) {
             int id = nx+k*mesh.Nq;
-            dfloat Gss = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + G11ID*mesh.Np];
+            dfloat Gss = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + mesh.G11ID*mesh.Np];
             A[eM*mesh.Np+iid] += Gss*mesh.D[ny+k*mesh.Nq]*mesh.D[ny+k*mesh.Nq];
           }
 
           int id = nx+ny*mesh.Nq;
-          dfloat Grs = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + G01ID*mesh.Np];
+          dfloat Grs = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + mesh.G01ID*mesh.Np];
           A[eM*mesh.Np+iid] += 2*Grs*mesh.D[nx+nx*mesh.Nq]*mesh.D[ny+ny*mesh.Nq];
 
-          dfloat JW = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + GWJID*mesh.Np];
+          dfloat JW = mesh.wJ[eM*mesh.Np + iid];
           A[eM*mesh.Np+iid] += JW*lambda;
 
         } else {
@@ -542,12 +533,12 @@ void elliptic_t::BuildOperatorDiagonalContinuousQuad2D(dfloat *A) {
 }
 
 
-void elliptic_t::BuildOperatorDiagonalIpdgQuad3D(dfloat *A) {
+void elliptic_t::BuildOperatorDiagonalIpdgQuad3D(memory<dfloat>& A) {
 
   // build some monolithic basis arrays (for quads and hexes)
-  dfloat *B  = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
+  memory<dfloat> B (mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Br(mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Bs(mesh.Np*mesh.Np, 0.0);
 
   int mode = 0;
   for(int nj=0;nj<mesh.N+1;++nj){
@@ -580,16 +571,16 @@ void elliptic_t::BuildOperatorDiagonalIpdgQuad3D(dfloat *A) {
       // (grad phi_n, grad phi_m)_{D^e}
       for(int i=0;i<mesh.Np;++i){
         dlong base = eM*mesh.Np*mesh.Nvgeo + i;
-        dfloat drdx = mesh.vgeo[base+mesh.Np*RXID];
-        dfloat drdy = mesh.vgeo[base+mesh.Np*RYID];
-        dfloat drdz = mesh.vgeo[base+mesh.Np*RZID];
-        dfloat dsdx = mesh.vgeo[base+mesh.Np*SXID];
-        dfloat dsdy = mesh.vgeo[base+mesh.Np*SYID];
-        dfloat dsdz = mesh.vgeo[base+mesh.Np*SZID];
-        dfloat dtdx = mesh.vgeo[base+mesh.Np*TXID];
-        dfloat dtdy = mesh.vgeo[base+mesh.Np*TYID];
-        dfloat dtdz = mesh.vgeo[base+mesh.Np*TZID];
-        dfloat JW   = mesh.vgeo[base+mesh.Np*JWID];
+        dfloat drdx = mesh.vgeo[base+mesh.Np*mesh.RXID];
+        dfloat drdy = mesh.vgeo[base+mesh.Np*mesh.RYID];
+        dfloat drdz = mesh.vgeo[base+mesh.Np*mesh.RZID];
+        dfloat dsdx = mesh.vgeo[base+mesh.Np*mesh.SXID];
+        dfloat dsdy = mesh.vgeo[base+mesh.Np*mesh.SYID];
+        dfloat dsdz = mesh.vgeo[base+mesh.Np*mesh.SZID];
+        dfloat dtdx = mesh.vgeo[base+mesh.Np*mesh.TXID];
+        dfloat dtdy = mesh.vgeo[base+mesh.Np*mesh.TYID];
+        dfloat dtdz = mesh.vgeo[base+mesh.Np*mesh.TZID];
+        dfloat JW   = mesh.wJ[eM*mesh.Np + i];
 
         int idn = n*mesh.Np+i;
         dfloat dlndx = drdx*Br[idn] + dsdx*Bs[idn] + dtdx;
@@ -606,25 +597,25 @@ void elliptic_t::BuildOperatorDiagonalIpdgQuad3D(dfloat *A) {
 
           // grab vol geofacs at surface nodes
           dlong baseM = eM*mesh.Np*mesh.Nvgeo + vidM;
-          dfloat drdxM = mesh.vgeo[baseM+mesh.Np*RXID];
-          dfloat drdyM = mesh.vgeo[baseM+mesh.Np*RYID];
-          dfloat drdzM = mesh.vgeo[baseM+mesh.Np*RZID];
+          dfloat drdxM = mesh.vgeo[baseM+mesh.Np*mesh.RXID];
+          dfloat drdyM = mesh.vgeo[baseM+mesh.Np*mesh.RYID];
+          dfloat drdzM = mesh.vgeo[baseM+mesh.Np*mesh.RZID];
 
-          dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*SXID];
-          dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*SYID];
-          dfloat dsdzM = mesh.vgeo[baseM+mesh.Np*SZID];
+          dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*mesh.SXID];
+          dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*mesh.SYID];
+          dfloat dsdzM = mesh.vgeo[baseM+mesh.Np*mesh.SZID];
 
-          dfloat dtdxM = mesh.vgeo[baseM+mesh.Np*TXID];
-          dfloat dtdyM = mesh.vgeo[baseM+mesh.Np*TYID];
-          dfloat dtdzM = mesh.vgeo[baseM+mesh.Np*TZID];
+          dfloat dtdxM = mesh.vgeo[baseM+mesh.Np*mesh.TXID];
+          dfloat dtdyM = mesh.vgeo[baseM+mesh.Np*mesh.TYID];
+          dfloat dtdzM = mesh.vgeo[baseM+mesh.Np*mesh.TZID];
 
           // grab surface geometric factors
           dlong base = mesh.Nsgeo*(eM*mesh.Nfp*mesh.Nfaces + fM*mesh.Nfp + i);
-          dfloat nx = mesh.sgeo[base+NXID];
-          dfloat ny = mesh.sgeo[base+NYID];
-          dfloat nz = mesh.sgeo[base+NZID];
-          dfloat wsJ = mesh.sgeo[base+WSJID];
-          dfloat hinv = mesh.sgeo[base+IHID];
+          dfloat nx = mesh.sgeo[base+mesh.NXID];
+          dfloat ny = mesh.sgeo[base+mesh.NYID];
+          dfloat nz = mesh.sgeo[base+mesh.NZID];
+          dfloat wsJ = mesh.sgeo[base+mesh.WSJID];
+          dfloat hinv = mesh.sgeo[base+mesh.IHID];
 
           // form negative trace terms in IPDG
           int idnM = n*mesh.Np+vidM;
@@ -644,11 +635,9 @@ void elliptic_t::BuildOperatorDiagonalIpdgQuad3D(dfloat *A) {
       }
     }
   }
-
-  free(B); free(Br); free(Bs);
 }
 
-void elliptic_t::BuildOperatorDiagonalContinuousQuad3D(dfloat *A) {
+void elliptic_t::BuildOperatorDiagonalContinuousQuad3D(memory<dfloat>& A) {
 
   for(dlong eM=0;eM<mesh.Nelements;++eM){
     for (int ny=0;ny<mesh.Nq;ny++) {
@@ -658,34 +647,34 @@ void elliptic_t::BuildOperatorDiagonalContinuousQuad3D(dfloat *A) {
 
         for (int k=0;k<mesh.Nq;k++) {
           int id = k+ny*mesh.Nq;
-          dfloat Grr = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + G00ID*mesh.Np];
+          dfloat Grr = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + mesh.G00ID*mesh.Np];
           A[eM*mesh.Np+iid] += Grr*mesh.D[nx+k*mesh.Nq]*mesh.D[nx+k*mesh.Nq];
         }
 
         for (int k=0;k<mesh.Nq;k++) {
           int id = nx+k*mesh.Nq;
-          dfloat Gss = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + G11ID*mesh.Np];
+          dfloat Gss = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + mesh.G11ID*mesh.Np];
           A[eM*mesh.Np+iid] += Gss*mesh.D[ny+k*mesh.Nq]*mesh.D[ny+k*mesh.Nq];
         }
 
         int id = nx+ny*mesh.Nq;
-        dfloat Grs = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + G01ID*mesh.Np];
+        dfloat Grs = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + mesh.G01ID*mesh.Np];
         A[eM*mesh.Np+iid] += 2*Grs*mesh.D[nx+nx*mesh.Nq]*mesh.D[ny+ny*mesh.Nq];
 
         // id = nx+ny*mesh.Nq;
-        // dfloat Grt = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + G02ID*mesh.Np];
+        // dfloat Grt = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + mesh.G02ID*mesh.Np];
         // A[eM*mesh.Np+iid] += 2*Grt*mesh.D[nx+nx*mesh.Nq];
 
         // id = nx+ny*mesh.Nq;
-        // dfloat Gst = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + G12ID*mesh.Np];
+        // dfloat Gst = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + mesh.G12ID*mesh.Np];
         // A[eM*mesh.Np+iid] += 2*Gst*mesh.D[ny+ny*mesh.Nq];
 
-        // dfloat Gtt = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + G22ID*mesh.Np];
+        // dfloat Gtt = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + mesh.G22ID*mesh.Np];
         // A[eM*mesh.Np+iid] += Gtt;
 
 
 
-        dfloat JW  = mesh.ggeo[eM*mesh.Np*mesh.Nggeo + id + GWJID*mesh.Np];
+        dfloat JW  = mesh.wJ[eM*mesh.Np + iid];
         A[eM*mesh.Np+iid] += JW*lambda;
       }
     }
@@ -694,10 +683,10 @@ void elliptic_t::BuildOperatorDiagonalContinuousQuad3D(dfloat *A) {
 
 
 
-void elliptic_t::BuildOperatorDiagonalIpdgTet3D(dfloat *A) {
+void elliptic_t::BuildOperatorDiagonalIpdgTet3D(memory<dfloat>& A) {
 
   // surface mass matrices MS = MM*LIFT
-  dfloat *MS = (dfloat *) calloc(mesh.Nfaces*mesh.Nfp*mesh.Nfp,sizeof(dfloat));
+  memory<dfloat> MS(mesh.Nfaces*mesh.Nfp*mesh.Nfp);
   for (int f=0;f<mesh.Nfaces;f++) {
     for (int n=0;n<mesh.Nfp;n++) {
       int fn = mesh.faceNodes[f*mesh.Nfp+n];
@@ -716,16 +705,16 @@ void elliptic_t::BuildOperatorDiagonalIpdgTet3D(dfloat *A) {
 
   for(dlong eM=0;eM<mesh.Nelements;++eM){
     dlong vbase = eM*mesh.Nvgeo;
-    dfloat drdx = mesh.vgeo[vbase+RXID];
-    dfloat drdy = mesh.vgeo[vbase+RYID];
-    dfloat drdz = mesh.vgeo[vbase+RZID];
-    dfloat dsdx = mesh.vgeo[vbase+SXID];
-    dfloat dsdy = mesh.vgeo[vbase+SYID];
-    dfloat dsdz = mesh.vgeo[vbase+SZID];
-    dfloat dtdx = mesh.vgeo[vbase+TXID];
-    dfloat dtdy = mesh.vgeo[vbase+TYID];
-    dfloat dtdz = mesh.vgeo[vbase+TZID];
-    dfloat J = mesh.vgeo[vbase+JID];
+    dfloat drdx = mesh.vgeo[vbase+mesh.RXID];
+    dfloat drdy = mesh.vgeo[vbase+mesh.RYID];
+    dfloat drdz = mesh.vgeo[vbase+mesh.RZID];
+    dfloat dsdx = mesh.vgeo[vbase+mesh.SXID];
+    dfloat dsdy = mesh.vgeo[vbase+mesh.SYID];
+    dfloat dsdz = mesh.vgeo[vbase+mesh.SZID];
+    dfloat dtdx = mesh.vgeo[vbase+mesh.TXID];
+    dfloat dtdy = mesh.vgeo[vbase+mesh.TYID];
+    dfloat dtdz = mesh.vgeo[vbase+mesh.TZID];
+    dfloat J = mesh.wJ[eM];
 
     dfloat G00 = drdx*drdx + drdy*drdy + drdz*drdz;
     dfloat G01 = drdx*dsdx + drdy*dsdy + drdz*dsdz;
@@ -757,11 +746,11 @@ void elliptic_t::BuildOperatorDiagonalIpdgTet3D(dfloat *A) {
     for (int fM=0;fM<mesh.Nfaces;fM++) {
       // load surface geofactors for this face
       dlong sid = mesh.Nsgeo*(eM*mesh.Nfaces+fM);
-      dfloat nx = mesh.sgeo[sid+NXID];
-      dfloat ny = mesh.sgeo[sid+NYID];
-      dfloat nz = mesh.sgeo[sid+NZID];
-      dfloat sJ = mesh.sgeo[sid+SJID];
-      dfloat hinv = mesh.sgeo[sid+IHID];
+      dfloat nx = mesh.sgeo[sid+mesh.NXID];
+      dfloat ny = mesh.sgeo[sid+mesh.NYID];
+      dfloat nz = mesh.sgeo[sid+mesh.NZID];
+      dfloat sJ = mesh.sgeo[sid+mesh.SJID];
+      dfloat hinv = mesh.sgeo[sid+mesh.IHID];
 
       int bc = mesh.EToB[fM+mesh.Nfaces*eM]; //raw boundary flag
 
@@ -782,7 +771,7 @@ void elliptic_t::BuildOperatorDiagonalIpdgTet3D(dfloat *A) {
       }
 
       // mass matrix for this face
-      dfloat *MSf = MS+fM*mesh.Nfp*mesh.Nfp;
+      memory<dfloat> MSf = MS+fM*mesh.Nfp*mesh.Nfp;
 
       // penalty term just involves face nodes
       for(int n=0;n<mesh.Nfp;++n){
@@ -842,21 +831,19 @@ void elliptic_t::BuildOperatorDiagonalIpdgTet3D(dfloat *A) {
       }
     }
   }
-
-  free(MS);
 }
 
-void elliptic_t::BuildOperatorDiagonalContinuousTet3D(dfloat *A) {
+void elliptic_t::BuildOperatorDiagonalContinuousTet3D(memory<dfloat>& A) {
 
   for(dlong eM=0;eM<mesh.Nelements;++eM){
     dlong gbase = eM*mesh.Nggeo;
-    dfloat Grr = mesh.ggeo[gbase + G00ID];
-    dfloat Grs = mesh.ggeo[gbase + G01ID];
-    dfloat Grt = mesh.ggeo[gbase + G02ID];
-    dfloat Gss = mesh.ggeo[gbase + G11ID];
-    dfloat Gst = mesh.ggeo[gbase + G12ID];
-    dfloat Gtt = mesh.ggeo[gbase + G22ID];
-    dfloat J   = mesh.ggeo[gbase + GWJID];
+    dfloat Grr = mesh.ggeo[gbase + mesh.G00ID];
+    dfloat Grs = mesh.ggeo[gbase + mesh.G01ID];
+    dfloat Grt = mesh.ggeo[gbase + mesh.G02ID];
+    dfloat Gss = mesh.ggeo[gbase + mesh.G11ID];
+    dfloat Gst = mesh.ggeo[gbase + mesh.G12ID];
+    dfloat Gtt = mesh.ggeo[gbase + mesh.G22ID];
+    dfloat J   = mesh.wJ[eM];
 
     /* start with stiffness matrix  */
     for(int n=0;n<mesh.Np;++n){
@@ -884,13 +871,13 @@ void elliptic_t::BuildOperatorDiagonalContinuousTet3D(dfloat *A) {
   }
 }
 
-void elliptic_t::BuildOperatorDiagonalIpdgHex3D(dfloat *A) {
+void elliptic_t::BuildOperatorDiagonalIpdgHex3D(memory<dfloat>& A) {
 
   // build some monolithic basis arrays (for quads and hexes)
-  dfloat *B  = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Bt = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
+  memory<dfloat> B (mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Br(mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Bs(mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Bt(mesh.Np*mesh.Np, 0.0);
 
   int mode = 0;
   for(int nk=0;nk<mesh.N+1;++nk){
@@ -930,16 +917,16 @@ void elliptic_t::BuildOperatorDiagonalIpdgHex3D(dfloat *A) {
       // (grad phi_n, grad phi_m)_{D^e}
       for(int i=0;i<mesh.Np;++i){
         dlong base = eM*mesh.Np*mesh.Nvgeo + i;
-        dfloat drdx = mesh.vgeo[base+mesh.Np*RXID];
-        dfloat drdy = mesh.vgeo[base+mesh.Np*RYID];
-        dfloat drdz = mesh.vgeo[base+mesh.Np*RZID];
-        dfloat dsdx = mesh.vgeo[base+mesh.Np*SXID];
-        dfloat dsdy = mesh.vgeo[base+mesh.Np*SYID];
-        dfloat dsdz = mesh.vgeo[base+mesh.Np*SZID];
-        dfloat dtdx = mesh.vgeo[base+mesh.Np*TXID];
-        dfloat dtdy = mesh.vgeo[base+mesh.Np*TYID];
-        dfloat dtdz = mesh.vgeo[base+mesh.Np*TZID];
-        dfloat JW   = mesh.vgeo[base+mesh.Np*JWID];
+        dfloat drdx = mesh.vgeo[base+mesh.Np*mesh.RXID];
+        dfloat drdy = mesh.vgeo[base+mesh.Np*mesh.RYID];
+        dfloat drdz = mesh.vgeo[base+mesh.Np*mesh.RZID];
+        dfloat dsdx = mesh.vgeo[base+mesh.Np*mesh.SXID];
+        dfloat dsdy = mesh.vgeo[base+mesh.Np*mesh.SYID];
+        dfloat dsdz = mesh.vgeo[base+mesh.Np*mesh.SZID];
+        dfloat dtdx = mesh.vgeo[base+mesh.Np*mesh.TXID];
+        dfloat dtdy = mesh.vgeo[base+mesh.Np*mesh.TYID];
+        dfloat dtdz = mesh.vgeo[base+mesh.Np*mesh.TZID];
+        dfloat JW   = mesh.wJ[eM*mesh.Np + i];
 
         int idn = n*mesh.Np+i;
         dfloat dlndx = drdx*Br[idn] + dsdx*Bs[idn] + dtdx*Bt[idn];
@@ -956,23 +943,23 @@ void elliptic_t::BuildOperatorDiagonalIpdgHex3D(dfloat *A) {
 
           // grab vol geofacs at surface nodes
           dlong baseM = eM*mesh.Np*mesh.Nvgeo + vidM;
-          dfloat drdxM = mesh.vgeo[baseM+mesh.Np*RXID];
-          dfloat drdyM = mesh.vgeo[baseM+mesh.Np*RYID];
-          dfloat drdzM = mesh.vgeo[baseM+mesh.Np*RZID];
-          dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*SXID];
-          dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*SYID];
-          dfloat dsdzM = mesh.vgeo[baseM+mesh.Np*SZID];
-          dfloat dtdxM = mesh.vgeo[baseM+mesh.Np*TXID];
-          dfloat dtdyM = mesh.vgeo[baseM+mesh.Np*TYID];
-          dfloat dtdzM = mesh.vgeo[baseM+mesh.Np*TZID];
+          dfloat drdxM = mesh.vgeo[baseM+mesh.Np*mesh.RXID];
+          dfloat drdyM = mesh.vgeo[baseM+mesh.Np*mesh.RYID];
+          dfloat drdzM = mesh.vgeo[baseM+mesh.Np*mesh.RZID];
+          dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*mesh.SXID];
+          dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*mesh.SYID];
+          dfloat dsdzM = mesh.vgeo[baseM+mesh.Np*mesh.SZID];
+          dfloat dtdxM = mesh.vgeo[baseM+mesh.Np*mesh.TXID];
+          dfloat dtdyM = mesh.vgeo[baseM+mesh.Np*mesh.TYID];
+          dfloat dtdzM = mesh.vgeo[baseM+mesh.Np*mesh.TZID];
 
           // grab surface geometric factors
           dlong base = mesh.Nsgeo*(eM*mesh.Nfp*mesh.Nfaces + fM*mesh.Nfp + i);
-          dfloat nx = mesh.sgeo[base+NXID];
-          dfloat ny = mesh.sgeo[base+NYID];
-          dfloat nz = mesh.sgeo[base+NZID];
-          dfloat wsJ = mesh.sgeo[base+WSJID];
-          dfloat hinv = mesh.sgeo[base+IHID];
+          dfloat nx = mesh.sgeo[base+mesh.NXID];
+          dfloat ny = mesh.sgeo[base+mesh.NYID];
+          dfloat nz = mesh.sgeo[base+mesh.NZID];
+          dfloat wsJ = mesh.sgeo[base+mesh.WSJID];
+          dfloat hinv = mesh.sgeo[base+mesh.IHID];
 
           // form negative trace terms in IPDG
           int idnM = n*mesh.Np+vidM;
@@ -1006,11 +993,9 @@ void elliptic_t::BuildOperatorDiagonalIpdgHex3D(dfloat *A) {
       }
     }
   }
-
-  free(B); free(Br); free(Bs); free(Bt);
 }
 
-void elliptic_t::BuildOperatorDiagonalContinuousHex3D(dfloat *A) {
+void elliptic_t::BuildOperatorDiagonalContinuousHex3D(memory<dfloat>& A) {
 
   for(dlong eM=0;eM<mesh.Nelements;++eM){
     for (int nz=0;nz<mesh.Nq;nz++) {
@@ -1024,34 +1009,34 @@ void elliptic_t::BuildOperatorDiagonalContinuousHex3D(dfloat *A) {
         dlong base = eM*mesh.Np*mesh.Nggeo;
 
 
-        dfloat Grs = mesh.ggeo[base + id + G01ID*mesh.Np];
+        dfloat Grs = mesh.ggeo[base + id + mesh.G01ID*mesh.Np];
         A[eM*mesh.Np+idn] += 2*Grs*mesh.D[nx+nx*mesh.Nq]*mesh.D[ny+ny*mesh.Nq];
 
-        dfloat Grt = mesh.ggeo[base + id + G02ID*mesh.Np];
+        dfloat Grt = mesh.ggeo[base + id + mesh.G02ID*mesh.Np];
         A[eM*mesh.Np+idn] += 2*Grt*mesh.D[nx+nx*mesh.Nq]*mesh.D[nz+nz*mesh.Nq];
 
-        dfloat Gst = mesh.ggeo[base + id + G12ID*mesh.Np];
+        dfloat Gst = mesh.ggeo[base + id + mesh.G12ID*mesh.Np];
         A[eM*mesh.Np+idn] += 2*Gst*mesh.D[ny+ny*mesh.Nq]*mesh.D[nz+nz*mesh.Nq];
 
         for (int k=0;k<mesh.Nq;k++) {
           int iid = k+ny*mesh.Nq+nz*mesh.Nq*mesh.Nq;
-          dfloat Grr = mesh.ggeo[base + iid + G00ID*mesh.Np];
+          dfloat Grr = mesh.ggeo[base + iid + mesh.G00ID*mesh.Np];
           A[eM*mesh.Np+idn] += Grr*mesh.D[nx+k*mesh.Nq]*mesh.D[nx+k*mesh.Nq];
         }
 
         for (int k=0;k<mesh.Nq;k++) {
           int iid = nx+k*mesh.Nq+nz*mesh.Nq*mesh.Nq;
-          dfloat Gss = mesh.ggeo[base + iid + G11ID*mesh.Np];
+          dfloat Gss = mesh.ggeo[base + iid + mesh.G11ID*mesh.Np];
           A[eM*mesh.Np+idn] += Gss*mesh.D[ny+k*mesh.Nq]*mesh.D[ny+k*mesh.Nq];
         }
 
         for (int k=0;k<mesh.Nq;k++) {
           int iid = nx+ny*mesh.Nq+k*mesh.Nq*mesh.Nq;
-          dfloat Gtt = mesh.ggeo[base + iid + G22ID*mesh.Np];
+          dfloat Gtt = mesh.ggeo[base + iid + mesh.G22ID*mesh.Np];
           A[eM*mesh.Np+idn] += Gtt*mesh.D[nz+k*mesh.Nq]*mesh.D[nz+k*mesh.Nq];
         }
 
-        dfloat JW = mesh.ggeo[base + id + GWJID*mesh.Np];
+        dfloat JW = mesh.wJ[eM*mesh.Np + idn];
         A[eM*mesh.Np+idn] += JW*lambda;
       } else {
         A[eM*mesh.Np+idn] = 1; //just put a 1 so A is invertable
diff --git a/solvers/elliptic/src/ellipticBuildOperatorMatrixContinuous.cpp b/solvers/elliptic/src/ellipticBuildOperatorMatrixContinuous.cpp
index 2286bb4f2..d629bb9ae 100644
--- a/solvers/elliptic/src/ellipticBuildOperatorMatrixContinuous.cpp
+++ b/solvers/elliptic/src/ellipticBuildOperatorMatrixContinuous.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,15 +25,20 @@ SOFTWARE.
 */
 
 #include "elliptic.hpp"
-#include "mesh/meshDefines2D.h"
-#include "mesh/meshDefines3D.h"
+
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::sort;
+#else
+using std::sort;
+#endif
 
 void elliptic_t::BuildOperatorMatrixContinuous(parAlmond::parCOO& A) {
 
   switch(mesh.elementType){
-  case TRIANGLES:
+  case Mesh::TRIANGLES:
     BuildOperatorMatrixContinuousTri2D(A); break;
-  case QUADRILATERALS:
+  case Mesh::QUADRILATERALS:
   {
     if(mesh.dim==2)
       BuildOperatorMatrixContinuousQuad2D(A);
@@ -42,9 +47,9 @@ void elliptic_t::BuildOperatorMatrixContinuous(parAlmond::parCOO& A) {
 
     break;
   }
-  case TETRAHEDRA:
+  case Mesh::TETRAHEDRA:
     BuildOperatorMatrixContinuousTet3D(A); break;
-  case HEXAHEDRA:
+  case Mesh::HEXAHEDRA:
     BuildOperatorMatrixContinuousHex3D(A); break;
   }
 }
@@ -52,12 +57,12 @@ void elliptic_t::BuildOperatorMatrixContinuous(parAlmond::parCOO& A) {
 void elliptic_t::BuildOperatorMatrixContinuousTri2D(parAlmond::parCOO& A) {
 
   // number of degrees of freedom on this rank (after gathering)
-  hlong Ngather = ogsMasked->Ngather;
+  hlong Ngather = ogsMasked.Ngather;
 
   // every gathered degree of freedom has its own global id
-  A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Ngather, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm);
+  A.globalRowStarts.malloc(mesh.size+1, 0);
+  A.globalColStarts.malloc(mesh.size+1, 0);
+  mesh.comm.Allgather(Ngather, A.globalRowStarts+1);
   for(int r=0;r<mesh.size;++r) {
     A.globalRowStarts[r+1] = A.globalRowStarts[r]+A.globalRowStarts[r+1];
     A.globalColStarts[r+1] = A.globalRowStarts[r+1];
@@ -66,26 +71,26 @@ void elliptic_t::BuildOperatorMatrixContinuousTri2D(parAlmond::parCOO& A) {
   // Build non-zeros of stiffness matrix (unassembled)
   dlong nnzLocal = mesh.Np*mesh.Np*mesh.Nelements;
 
-  parAlmond::parCOO::nonZero_t *sendNonZeros = (parAlmond::parCOO::nonZero_t*) calloc(nnzLocal, sizeof(parAlmond::parCOO::nonZero_t));
-  int *AsendCounts  = (int*) calloc(mesh.size, sizeof(int));
-  int *ArecvCounts  = (int*) calloc(mesh.size, sizeof(int));
-  int *AsendOffsets = (int*) calloc(mesh.size+1, sizeof(int));
-  int *ArecvOffsets = (int*) calloc(mesh.size+1, sizeof(int));
+  memory<parAlmond::parCOO::nonZero_t> sendNonZeros(nnzLocal);
+  memory<int> AsendCounts (mesh.size, 0);
+  memory<int> ArecvCounts (mesh.size);
+  memory<int> AsendOffsets(mesh.size+1);
+  memory<int> ArecvOffsets(mesh.size+1);
 
-  dfloat *Srr = mesh.Srr;
-  dfloat *Srs = mesh.Srs;
-  dfloat *Sss = mesh.Sss;
-  dfloat *MM  = mesh.MM ;
+  memory<dfloat> Srr = mesh.Srr;
+  memory<dfloat> Srs = mesh.Srs;
+  memory<dfloat> Sss = mesh.Sss;
+  memory<dfloat> MM  = mesh.MM ;
 
-  if(mesh.rank==0) {printf("Building full FEM matrix...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building full FEM matrix...");fflush(stdout);}
 
   //Build unassembed non-zeros
   dlong cnt =0;
   for (dlong e=0;e<mesh.Nelements;e++) {
-    dfloat Grr = mesh.ggeo[e*mesh.Nggeo + G00ID];
-    dfloat Grs = mesh.ggeo[e*mesh.Nggeo + G01ID];
-    dfloat Gss = mesh.ggeo[e*mesh.Nggeo + G11ID];
-    dfloat J   = mesh.ggeo[e*mesh.Nggeo + GWJID];
+    dfloat Grr = mesh.ggeo[e*mesh.Nggeo + mesh.G00ID];
+    dfloat Grs = mesh.ggeo[e*mesh.Nggeo + mesh.G01ID];
+    dfloat Gss = mesh.ggeo[e*mesh.Nggeo + mesh.G11ID];
+    dfloat J   = mesh.wJ[e];
 
     for (int n=0;n<mesh.Np;n++) {
       if (maskedGlobalNumbering[e*mesh.Np + n]<0) continue; //skip masked nodes
@@ -112,14 +117,14 @@ void elliptic_t::BuildOperatorMatrixContinuousTri2D(parAlmond::parCOO& A) {
   }
 
   // sort by row ordering
-  std::sort(sendNonZeros, sendNonZeros+cnt,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(sendNonZeros.ptr(), sendNonZeros.ptr()+cnt,
+       [](const parAlmond::parCOO::nonZero_t& a,
+          const parAlmond::parCOO::nonZero_t& b) {
+         if (a.row < b.row) return true;
+         if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+         return a.col < b.col;
+        });
 
   // count how many non-zeros to send to each process
   int rr=0;
@@ -130,32 +135,33 @@ void elliptic_t::BuildOperatorMatrixContinuousTri2D(parAlmond::parCOO& A) {
   }
 
   // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh.comm);
+  mesh.comm.Alltoall(AsendCounts, ArecvCounts);
 
   // find send and recv offsets for gather
   A.nnz = 0;
+  AsendOffsets[0] = 0;
+  ArecvOffsets[0] = 0;
   for(int r=0;r<mesh.size;++r){
     AsendOffsets[r+1] = AsendOffsets[r] + AsendCounts[r];
     ArecvOffsets[r+1] = ArecvOffsets[r] + ArecvCounts[r];
     A.nnz += ArecvCounts[r];
   }
 
-  A.entries = (parAlmond::parCOO::nonZero_t*) calloc(A.nnz, sizeof(parAlmond::parCOO::nonZero_t));
+  A.entries.malloc(A.nnz);
 
   // determine number to receive
-  MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, parAlmond::MPI_NONZERO_T,
-                   A.entries, ArecvCounts, ArecvOffsets, parAlmond::MPI_NONZERO_T,
-                   mesh.comm);
+  mesh.comm.Alltoallv(sendNonZeros, AsendCounts, AsendOffsets,
+                      A.entries,    ArecvCounts, ArecvOffsets);
 
   // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
-  std::sort(A.entries, A.entries+A.nnz,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(A.entries.ptr(), A.entries.ptr()+A.nnz,
+       [](const parAlmond::parCOO::nonZero_t& a,
+          const parAlmond::parCOO::nonZero_t& b) {
+         if (a.row < b.row) return true;
+         if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+         return a.col < b.col;
+       });
 
   // compress duplicates
   cnt = 0;
@@ -172,26 +178,19 @@ void elliptic_t::BuildOperatorMatrixContinuousTri2D(parAlmond::parCOO& A) {
   if (A.nnz) cnt++;
   A.nnz = cnt;
 
-  if(mesh.rank==0) printf("done.\n");
-
-  MPI_Barrier(mesh.comm);
-  free(sendNonZeros);
-  free(AsendCounts);
-  free(ArecvCounts);
-  free(AsendOffsets);
-  free(ArecvOffsets);
+  if(Comm::World().rank()==0) printf("done.\n");
 }
 
 
 void elliptic_t::BuildOperatorMatrixContinuousQuad3D(parAlmond::parCOO& A) {
 
   // number of degrees of freedom on this rank (after gathering)
-  hlong Ngather = ogsMasked->Ngather;
+  hlong Ngather = ogsMasked.Ngather;
 
   // every gathered degree of freedom has its own global id
-  A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Ngather, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm);
+  A.globalRowStarts.malloc(mesh.size+1, 0);
+  A.globalColStarts.malloc(mesh.size+1, 0);
+  mesh.comm.Allgather(Ngather, A.globalRowStarts+1);
   for(int r=0;r<mesh.size;++r) {
     A.globalRowStarts[r+1] = A.globalRowStarts[r]+A.globalRowStarts[r+1];
     A.globalColStarts[r+1] = A.globalRowStarts[r+1];
@@ -199,13 +198,13 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad3D(parAlmond::parCOO& A) {
 
   // 2. Build non-zeros of stiffness matrix (unassembled)
   dlong nnzLocal = mesh.Np*mesh.Np*mesh.Nelements;
-  parAlmond::parCOO::nonZero_t *sendNonZeros = (parAlmond::parCOO::nonZero_t*) calloc(nnzLocal, sizeof(parAlmond::parCOO::nonZero_t));
-  int *AsendCounts  = (int*) calloc(mesh.size, sizeof(int));
-  int *ArecvCounts  = (int*) calloc(mesh.size, sizeof(int));
-  int *AsendOffsets = (int*) calloc(mesh.size+1, sizeof(int));
-  int *ArecvOffsets = (int*) calloc(mesh.size+1, sizeof(int));
+  memory<parAlmond::parCOO::nonZero_t> sendNonZeros(nnzLocal);
+  memory<int> AsendCounts (mesh.size, 0);
+  memory<int> ArecvCounts (mesh.size);
+  memory<int> AsendOffsets(mesh.size+1);
+  memory<int> ArecvOffsets(mesh.size+1);
 
-  if(mesh.rank==0) {printf("Building full FEM matrix...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building full FEM matrix...");fflush(stdout);}
 
 #if 0
   hlong NTf = mesh.Nelements*mesh.Np * mesh.Nelements*mesh.Np ;
@@ -228,34 +227,34 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad3D(parAlmond::parCOO& A) {
             if (ny==my) {
               for (int k=0;k<mesh.Nq;k++) {
                 id = k+ny*mesh.Nq;
-                dfloat Grr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G00ID*mesh.Np];
+                dfloat Grr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G00ID*mesh.Np];
 
                 val += Grr*mesh.D[nx+k*mesh.Nq]*mesh.D[mx+k*mesh.Nq];
               }
             }
 
             id = mx+ny*mesh.Nq;
-            dfloat Grs = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G01ID*mesh.Np];
+            dfloat Grs = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G01ID*mesh.Np];
             val += Grs*mesh.D[nx+mx*mesh.Nq]*mesh.D[my+ny*mesh.Nq];
 
             id = nx+my*mesh.Nq;
-            dfloat Gsr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G01ID*mesh.Np];
+            dfloat Gsr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G01ID*mesh.Np];
             val += Gsr*mesh.D[mx+nx*mesh.Nq]*mesh.D[ny+my*mesh.Nq];
 
 
             // id = mx+ny*mesh.Nq;
-            // dfloat Grt = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G02ID*mesh.Np];
+            // dfloat Grt = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G02ID*mesh.Np];
             // val += Grt*mesh.D[nx+mx*mesh.Nq];
 
             // id = nx+my*mesh.Nq;
-            // dfloat Gtr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G02ID*mesh.Np];
+            // dfloat Gtr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G02ID*mesh.Np];
             // val += Gtr*mesh.D[mx+nx*mesh.Nq];
 
 
             if (nx==mx) {
               for (int k=0;k<mesh.Nq;k++) {
                 id = nx+k*mesh.Nq;
-                dfloat Gss = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G11ID*mesh.Np];
+                dfloat Gss = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G11ID*mesh.Np];
 
                 val += Gss*mesh.D[ny+k*mesh.Nq]*mesh.D[my+k*mesh.Nq];
               }
@@ -263,21 +262,21 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad3D(parAlmond::parCOO& A) {
 
             // double check following two: AK
             // id = nx+my*mesh.Nq;
-            // dfloat Gst = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G12ID*mesh.Np];
+            // dfloat Gst = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G12ID*mesh.Np];
             // val += Gst*mesh.D[ny+my*mesh.Nq];
 
             // id = mx+ny*mesh.Nq;
-            // dfloat Gts = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G12ID*mesh.Np];
+            // dfloat Gts = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G12ID*mesh.Np];
             // val += Gts*mesh.D[my+ny*mesh.Nq];
 
 
             if ((nx==mx)&&(ny==my)) {
               id = nx + ny*mesh.Nq;
 
-              // dfloat Gtt = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G22ID*mesh.Np];
+              // dfloat Gtt = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G22ID*mesh.Np];
               // val += Gtt;
 
-              dfloat JW = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + GWJID*mesh.Np];
+              dfloat JW = mesh.wJ[e*mesh.Np + id];
               val += JW*lambda;
             }
 
@@ -321,14 +320,14 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad3D(parAlmond::parCOO& A) {
 #endif
 
   // sort by row ordering
-  std::sort(sendNonZeros, sendNonZeros+cnt,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(sendNonZeros.ptr(), sendNonZeros.ptr()+cnt,
+        [](const parAlmond::parCOO::nonZero_t& a,
+           const parAlmond::parCOO::nonZero_t& b) {
+          if (a.row < b.row) return true;
+          if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+          return a.col < b.col;
+        });
 
   // count how many non-zeros to send to each process
   int rr=0;
@@ -339,32 +338,33 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad3D(parAlmond::parCOO& A) {
   }
 
   // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh.comm);
+  mesh.comm.Alltoall(AsendCounts, ArecvCounts);
 
   // find send and recv offsets for gather
   A.nnz = 0;
+  AsendOffsets[0] = 0;
+  ArecvOffsets[0] = 0;
   for(int r=0;r<mesh.size;++r){
     AsendOffsets[r+1] = AsendOffsets[r] + AsendCounts[r];
     ArecvOffsets[r+1] = ArecvOffsets[r] + ArecvCounts[r];
     A.nnz += ArecvCounts[r];
   }
 
-  A.entries = (parAlmond::parCOO::nonZero_t*) calloc(A.nnz, sizeof(parAlmond::parCOO::nonZero_t));
+  A.entries.malloc(A.nnz);
 
   // determine number to receive
-  MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, parAlmond::MPI_NONZERO_T,
-                   A.entries, ArecvCounts, ArecvOffsets, parAlmond::MPI_NONZERO_T,
-                   mesh.comm);
+  mesh.comm.Alltoallv(sendNonZeros, AsendCounts, AsendOffsets,
+                      A.entries,    ArecvCounts, ArecvOffsets);
 
   // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
-  std::sort(A.entries, A.entries+A.nnz,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(A.entries.ptr(), A.entries.ptr()+A.nnz,
+        [](const parAlmond::parCOO::nonZero_t& a,
+           const parAlmond::parCOO::nonZero_t& b) {
+          if (a.row < b.row) return true;
+          if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+          return a.col < b.col;
+        });
 
   // compress duplicates
   cnt = 0;
@@ -395,26 +395,19 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad3D(parAlmond::parCOO& A) {
  fclose(fp);
 #endif
 
-  if(mesh.rank==0) printf("done.\n");
-
-  MPI_Barrier(mesh.comm);
-  free(sendNonZeros);
-  free(AsendCounts);
-  free(ArecvCounts);
-  free(AsendOffsets);
-  free(ArecvOffsets);
+  if(Comm::World().rank()==0) printf("done.\n");
 }
 
 
 void elliptic_t::BuildOperatorMatrixContinuousQuad2D(parAlmond::parCOO& A) {
 
   // number of degrees of freedom on this rank (after gathering)
-  hlong Ngather = ogsMasked->Ngather;
+  hlong Ngather = ogsMasked.Ngather;
 
   // every gathered degree of freedom has its own global id
-  A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Ngather, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm);
+  A.globalRowStarts.malloc(mesh.size+1,0);
+  A.globalColStarts.malloc(mesh.size+1,0);
+  mesh.comm.Allgather(Ngather, A.globalRowStarts+1);
   for(int r=0;r<mesh.size;++r) {
     A.globalRowStarts[r+1] = A.globalRowStarts[r]+A.globalRowStarts[r+1];
     A.globalColStarts[r+1] = A.globalRowStarts[r+1];
@@ -422,13 +415,13 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad2D(parAlmond::parCOO& A) {
 
   // 2. Build non-zeros of stiffness matrix (unassembled)
   dlong nnzLocal = mesh.Np*mesh.Np*mesh.Nelements;
-  parAlmond::parCOO::nonZero_t *sendNonZeros = (parAlmond::parCOO::nonZero_t*) calloc(nnzLocal, sizeof(parAlmond::parCOO::nonZero_t));
-  int *AsendCounts  = (int*) calloc(mesh.size, sizeof(int));
-  int *ArecvCounts  = (int*) calloc(mesh.size, sizeof(int));
-  int *AsendOffsets = (int*) calloc(mesh.size+1, sizeof(int));
-  int *ArecvOffsets = (int*) calloc(mesh.size+1, sizeof(int));
+  memory<parAlmond::parCOO::nonZero_t> sendNonZeros(nnzLocal);
+  memory<int> AsendCounts (mesh.size, 0);
+  memory<int> ArecvCounts (mesh.size);
+  memory<int> AsendOffsets(mesh.size+1);
+  memory<int> ArecvOffsets(mesh.size+1);
 
-  if(mesh.rank==0) {printf("Building full FEM matrix...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building full FEM matrix...");fflush(stdout);}
 
   //Build unassembed non-zeros
   dlong cnt =0;
@@ -446,25 +439,25 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad2D(parAlmond::parCOO& A) {
             if (ny==my) {
               for (int k=0;k<mesh.Nq;k++) {
                 id = k+ny*mesh.Nq;
-                dfloat Grr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G00ID*mesh.Np];
+                dfloat Grr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G00ID*mesh.Np];
 
                 val += Grr*mesh.D[nx+k*mesh.Nq]*mesh.D[mx+k*mesh.Nq];
               }
             }
 
             id = mx+ny*mesh.Nq;
-            dfloat Grs = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G01ID*mesh.Np];
+            dfloat Grs = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G01ID*mesh.Np];
             val += Grs*mesh.D[nx+mx*mesh.Nq]*mesh.D[my+ny*mesh.Nq];
 
 
             id = nx+my*mesh.Nq;
-            dfloat Gsr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G01ID*mesh.Np];
+            dfloat Gsr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G01ID*mesh.Np];
             val += Gsr*mesh.D[mx+nx*mesh.Nq]*mesh.D[ny+my*mesh.Nq];
 
             if (nx==mx) {
               for (int k=0;k<mesh.Nq;k++) {
                 id = nx+k*mesh.Nq;
-                dfloat Gss = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G11ID*mesh.Np];
+                dfloat Gss = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G11ID*mesh.Np];
 
                 val += Gss*mesh.D[ny+k*mesh.Nq]*mesh.D[my+k*mesh.Nq];
               }
@@ -472,7 +465,7 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad2D(parAlmond::parCOO& A) {
 
             if ((nx==mx)&&(ny==my)) {
               id = nx + ny*mesh.Nq;
-              dfloat JW = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + GWJID*mesh.Np];
+              dfloat JW = mesh.wJ[e*mesh.Np + id];
               val += JW*lambda;
             }
 
@@ -491,14 +484,14 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad2D(parAlmond::parCOO& A) {
   }
 
   // sort by row ordering
-  std::sort(sendNonZeros, sendNonZeros+cnt,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(sendNonZeros.ptr(), sendNonZeros.ptr()+cnt,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
 
   // count how many non-zeros to send to each process
   int rr=0;
@@ -509,32 +502,33 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad2D(parAlmond::parCOO& A) {
   }
 
   // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh.comm);
+  mesh.comm.Alltoall(AsendCounts, ArecvCounts);
 
   // find send and recv offsets for gather
   A.nnz = 0;
+  AsendOffsets[0] = 0;
+  ArecvOffsets[0] = 0;
   for(int r=0;r<mesh.size;++r){
     AsendOffsets[r+1] = AsendOffsets[r] + AsendCounts[r];
     ArecvOffsets[r+1] = ArecvOffsets[r] + ArecvCounts[r];
     A.nnz += ArecvCounts[r];
   }
 
-  A.entries = (parAlmond::parCOO::nonZero_t*) calloc(A.nnz, sizeof(parAlmond::parCOO::nonZero_t));
+  A.entries.malloc(A.nnz);
 
   // determine number to receive
-  MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, parAlmond::MPI_NONZERO_T,
-                   A.entries, ArecvCounts, ArecvOffsets, parAlmond::MPI_NONZERO_T,
-                   mesh.comm);
+  mesh.comm.Alltoallv(sendNonZeros, AsendCounts, AsendOffsets,
+                      A.entries,    ArecvCounts, ArecvOffsets);
 
   // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
-  std::sort(A.entries, A.entries+A.nnz,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(A.entries.ptr(), A.entries.ptr()+A.nnz,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
 
   // compress duplicates
   cnt = 0;
@@ -565,25 +559,18 @@ void elliptic_t::BuildOperatorMatrixContinuousQuad2D(parAlmond::parCOO& A) {
  fclose(fp);
 #endif
 
-  if(mesh.rank==0) printf("done.\n");
-
-  MPI_Barrier(mesh.comm);
-  free(sendNonZeros);
-  free(AsendCounts);
-  free(ArecvCounts);
-  free(AsendOffsets);
-  free(ArecvOffsets);
+  if(Comm::World().rank()==0) printf("done.\n");
 }
 
 void elliptic_t::BuildOperatorMatrixContinuousTet3D(parAlmond::parCOO& A) {
 
   // number of degrees of freedom on this rank (after gathering)
-  hlong Ngather = ogsMasked->Ngather;
+  hlong Ngather = ogsMasked.Ngather;
 
   // every gathered degree of freedom has its own global id
-  A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Ngather, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm);
+  A.globalRowStarts.malloc(mesh.size+1,0);
+  A.globalColStarts.malloc(mesh.size+1,0);
+  mesh.comm.Allgather(Ngather, A.globalRowStarts+1);
   for(int r=0;r<mesh.size;++r) {
     A.globalRowStarts[r+1] = A.globalRowStarts[r]+A.globalRowStarts[r+1];
     A.globalColStarts[r+1] = A.globalRowStarts[r+1];
@@ -592,26 +579,26 @@ void elliptic_t::BuildOperatorMatrixContinuousTet3D(parAlmond::parCOO& A) {
   // Build non-zeros of stiffness matrix (unassembled)
   dlong nnzLocal = mesh.Np*mesh.Np*mesh.Nelements;
 
-  parAlmond::parCOO::nonZero_t *sendNonZeros = (parAlmond::parCOO::nonZero_t*) calloc(nnzLocal, sizeof(parAlmond::parCOO::nonZero_t));
-  int *AsendCounts  = (int*) calloc(mesh.size, sizeof(int));
-  int *ArecvCounts  = (int*) calloc(mesh.size, sizeof(int));
-  int *AsendOffsets = (int*) calloc(mesh.size+1, sizeof(int));
-  int *ArecvOffsets = (int*) calloc(mesh.size+1, sizeof(int));
+  memory<parAlmond::parCOO::nonZero_t> sendNonZeros(nnzLocal);
+  memory<int> AsendCounts (mesh.size, 0);
+  memory<int> ArecvCounts (mesh.size);
+  memory<int> AsendOffsets(mesh.size+1);
+  memory<int> ArecvOffsets(mesh.size+1);
 
   //Build unassembed non-zeros
-  if(mesh.rank==0) {printf("Building full FEM matrix...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building full FEM matrix...");fflush(stdout);}
 
   dlong cnt =0;
   //#pragma omp parallel for
   for (dlong e=0;e<mesh.Nelements;e++) {
 
-    dfloat Grr = mesh.ggeo[e*mesh.Nggeo + G00ID];
-    dfloat Grs = mesh.ggeo[e*mesh.Nggeo + G01ID];
-    dfloat Grt = mesh.ggeo[e*mesh.Nggeo + G02ID];
-    dfloat Gss = mesh.ggeo[e*mesh.Nggeo + G11ID];
-    dfloat Gst = mesh.ggeo[e*mesh.Nggeo + G12ID];
-    dfloat Gtt = mesh.ggeo[e*mesh.Nggeo + G22ID];
-    dfloat J   = mesh.ggeo[e*mesh.Nggeo + GWJID];
+    dfloat Grr = mesh.ggeo[e*mesh.Nggeo + mesh.G00ID];
+    dfloat Grs = mesh.ggeo[e*mesh.Nggeo + mesh.G01ID];
+    dfloat Grt = mesh.ggeo[e*mesh.Nggeo + mesh.G02ID];
+    dfloat Gss = mesh.ggeo[e*mesh.Nggeo + mesh.G11ID];
+    dfloat Gst = mesh.ggeo[e*mesh.Nggeo + mesh.G12ID];
+    dfloat Gtt = mesh.ggeo[e*mesh.Nggeo + mesh.G22ID];
+    dfloat J   = mesh.wJ[e];
 
     for (int n=0;n<mesh.Np;n++) {
       if (maskedGlobalNumbering[e*mesh.Np + n]<0) continue; //skip masked nodes
@@ -643,14 +630,14 @@ void elliptic_t::BuildOperatorMatrixContinuousTet3D(parAlmond::parCOO& A) {
   }
 
   // sort by row ordering
-  std::sort(sendNonZeros, sendNonZeros+cnt,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(sendNonZeros.ptr(), sendNonZeros.ptr()+cnt,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
 
   // count how many non-zeros to send to each process
   int rr=0;
@@ -661,32 +648,33 @@ void elliptic_t::BuildOperatorMatrixContinuousTet3D(parAlmond::parCOO& A) {
   }
 
   // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh.comm);
+  mesh.comm.Alltoall(AsendCounts, ArecvCounts);
 
   // find send and recv offsets for gather
   A.nnz = 0;
+  AsendOffsets[0] = 0;
+  ArecvOffsets[0] = 0;
   for(int r=0;r<mesh.size;++r){
     AsendOffsets[r+1] = AsendOffsets[r] + AsendCounts[r];
     ArecvOffsets[r+1] = ArecvOffsets[r] + ArecvCounts[r];
     A.nnz += ArecvCounts[r];
   }
 
-  A.entries = (parAlmond::parCOO::nonZero_t*) calloc(A.nnz, sizeof(parAlmond::parCOO::nonZero_t));
+  A.entries.malloc(A.nnz);
 
   // determine number to receive
-  MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, parAlmond::MPI_NONZERO_T,
-                   A.entries, ArecvCounts, ArecvOffsets, parAlmond::MPI_NONZERO_T,
-                   mesh.comm);
+  mesh.comm.Alltoallv(sendNonZeros, AsendCounts, AsendOffsets,
+                      A.entries,    ArecvCounts, ArecvOffsets);
 
   // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
-  std::sort(A.entries, A.entries+A.nnz,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(A.entries.ptr(), A.entries.ptr()+A.nnz,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
 
   // compress duplicates
   cnt = 0;
@@ -703,25 +691,18 @@ void elliptic_t::BuildOperatorMatrixContinuousTet3D(parAlmond::parCOO& A) {
   if (A.nnz) cnt++;
   A.nnz = cnt;
 
-  if(mesh.rank==0) printf("done.\n");
-
-  MPI_Barrier(mesh.comm);
-  free(sendNonZeros);
-  free(AsendCounts);
-  free(ArecvCounts);
-  free(AsendOffsets);
-  free(ArecvOffsets);
+  if(Comm::World().rank()==0) printf("done.\n");
 }
 
 void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) {
 
   // number of degrees of freedom on this rank (after gathering)
-  hlong Ngather = ogsMasked->Ngather;
+  hlong Ngather = ogsMasked.Ngather;
 
   // every gathered degree of freedom has its own global id
-  A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Ngather, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm);
+  A.globalRowStarts.malloc(mesh.size+1,0);
+  A.globalColStarts.malloc(mesh.size+1,0);
+  mesh.comm.Allgather(Ngather, A.globalRowStarts+1);
   for(int r=0;r<mesh.size;++r) {
     A.globalRowStarts[r+1] = A.globalRowStarts[r]+A.globalRowStarts[r+1];
     A.globalColStarts[r+1] = A.globalRowStarts[r+1];
@@ -729,13 +710,13 @@ void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) {
 
   // 2. Build non-zeros of stiffness matrix (unassembled)
   dlong nnzLocal = mesh.Np*mesh.Np*mesh.Nelements;
-  parAlmond::parCOO::nonZero_t *sendNonZeros = (parAlmond::parCOO::nonZero_t*) calloc(nnzLocal, sizeof(parAlmond::parCOO::nonZero_t));
-  int *AsendCounts  = (int*) calloc(mesh.size, sizeof(int));
-  int *ArecvCounts  = (int*) calloc(mesh.size, sizeof(int));
-  int *AsendOffsets = (int*) calloc(mesh.size+1, sizeof(int));
-  int *ArecvOffsets = (int*) calloc(mesh.size+1, sizeof(int));
+  memory<parAlmond::parCOO::nonZero_t> sendNonZeros(nnzLocal);
+  memory<int> AsendCounts (mesh.size, 0);
+  memory<int> ArecvCounts (mesh.size);
+  memory<int> AsendOffsets(mesh.size+1);
+  memory<int> ArecvOffsets(mesh.size+1);
 
-  if(mesh.rank==0) {printf("Building full FEM matrix...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building full FEM matrix...");fflush(stdout);}
 
   dlong cnt =0;
   for (dlong e=0;e<mesh.Nelements;e++) {
@@ -757,7 +738,7 @@ void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) {
             if ((ny==my)&&(nz==mz)) {
               for (int k=0;k<mesh.Nq;k++) {
                 id = k+ny*mesh.Nq+nz*mesh.Nq*mesh.Nq;
-                dfloat Grr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G00ID*mesh.Np];
+                dfloat Grr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G00ID*mesh.Np];
 
                 val += Grr*mesh.D[nx+k*mesh.Nq]*mesh.D[mx+k*mesh.Nq];
               }
@@ -765,28 +746,28 @@ void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) {
 
             if (nz==mz) {
               id = mx+ny*mesh.Nq+nz*mesh.Nq*mesh.Nq;
-              dfloat Grs = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G01ID*mesh.Np];
+              dfloat Grs = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G01ID*mesh.Np];
               val += Grs*mesh.D[nx+mx*mesh.Nq]*mesh.D[my+ny*mesh.Nq];
 
               id = nx+my*mesh.Nq+nz*mesh.Nq*mesh.Nq;
-              dfloat Gsr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G01ID*mesh.Np];
+              dfloat Gsr = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G01ID*mesh.Np];
               val += Gsr*mesh.D[mx+nx*mesh.Nq]*mesh.D[ny+my*mesh.Nq];
             }
 
             if (ny==my) {
               id = mx+ny*mesh.Nq+nz*mesh.Nq*mesh.Nq;
-              dfloat Grt = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G02ID*mesh.Np];
+              dfloat Grt = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G02ID*mesh.Np];
               val += Grt*mesh.D[nx+mx*mesh.Nq]*mesh.D[mz+nz*mesh.Nq];
 
               id = nx+ny*mesh.Nq+mz*mesh.Nq*mesh.Nq;
-              dfloat Gst = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G02ID*mesh.Np];
+              dfloat Gst = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G02ID*mesh.Np];
               val += Gst*mesh.D[mx+nx*mesh.Nq]*mesh.D[nz+mz*mesh.Nq];
             }
 
             if ((nx==mx)&&(nz==mz)) {
               for (int k=0;k<mesh.Nq;k++) {
                 id = nx+k*mesh.Nq+nz*mesh.Nq*mesh.Nq;
-                dfloat Gss = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G11ID*mesh.Np];
+                dfloat Gss = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G11ID*mesh.Np];
 
                 val += Gss*mesh.D[ny+k*mesh.Nq]*mesh.D[my+k*mesh.Nq];
               }
@@ -794,18 +775,18 @@ void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) {
 
             if (nx==mx) {
               id = nx+my*mesh.Nq+nz*mesh.Nq*mesh.Nq;
-              dfloat Gst = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G12ID*mesh.Np];
+              dfloat Gst = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G12ID*mesh.Np];
               val += Gst*mesh.D[ny+my*mesh.Nq]*mesh.D[mz+nz*mesh.Nq];
 
               id = nx+ny*mesh.Nq+mz*mesh.Nq*mesh.Nq;
-              dfloat Gts = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G12ID*mesh.Np];
+              dfloat Gts = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G12ID*mesh.Np];
               val += Gts*mesh.D[my+ny*mesh.Nq]*mesh.D[nz+mz*mesh.Nq];
             }
 
             if ((nx==mx)&&(ny==my)) {
               for (int k=0;k<mesh.Nq;k++) {
                 id = nx+ny*mesh.Nq+k*mesh.Nq*mesh.Nq;
-                dfloat Gtt = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + G22ID*mesh.Np];
+                dfloat Gtt = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + mesh.G22ID*mesh.Np];
 
                 val += Gtt*mesh.D[nz+k*mesh.Nq]*mesh.D[mz+k*mesh.Nq];
               }
@@ -813,7 +794,7 @@ void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) {
 
             if ((nx==mx)&&(ny==my)&&(nz==mz)) {
               id = nx + ny*mesh.Nq+nz*mesh.Nq*mesh.Nq;
-              dfloat JW = mesh.ggeo[e*mesh.Np*mesh.Nggeo + id + GWJID*mesh.Np];
+              dfloat JW = mesh.wJ[e*mesh.Np + id];
               val += JW*lambda;
             }
 
@@ -834,14 +815,14 @@ void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) {
   }
 
   // sort by row ordering
-  std::sort(sendNonZeros, sendNonZeros+cnt,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(sendNonZeros.ptr(), sendNonZeros.ptr()+cnt,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
 
   // count how many non-zeros to send to each process
   int rr=0;
@@ -852,32 +833,33 @@ void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) {
   }
 
   // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(AsendCounts, 1, MPI_INT, ArecvCounts, 1, MPI_INT, mesh.comm);
+  mesh.comm.Alltoall(AsendCounts, ArecvCounts);
 
   // find send and recv offsets for gather
   A.nnz = 0;
+  AsendOffsets[0] = 0;
+  ArecvOffsets[0] = 0;
   for(int r=0;r<mesh.size;++r){
     AsendOffsets[r+1] = AsendOffsets[r] + AsendCounts[r];
     ArecvOffsets[r+1] = ArecvOffsets[r] + ArecvCounts[r];
     A.nnz += ArecvCounts[r];
   }
 
-  A.entries = (parAlmond::parCOO::nonZero_t*) calloc(A.nnz, sizeof(parAlmond::parCOO::nonZero_t));
+  A.entries.malloc(A.nnz);
 
   // determine number to receive
-  MPI_Alltoallv(sendNonZeros, AsendCounts, AsendOffsets, parAlmond::MPI_NONZERO_T,
-                   A.entries, ArecvCounts, ArecvOffsets, parAlmond::MPI_NONZERO_T,
-                   mesh.comm);
+  mesh.comm.Alltoallv(sendNonZeros, AsendCounts, AsendOffsets,
+                      A.entries,    ArecvCounts, ArecvOffsets);
 
   // sort received non-zero entries by row block (may need to switch compareRowColumn tests)
-  std::sort(A.entries, A.entries+A.nnz,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(A.entries.ptr(), A.entries.ptr()+A.nnz,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
 
   // compress duplicates
   cnt = 0;
@@ -894,12 +876,5 @@ void elliptic_t::BuildOperatorMatrixContinuousHex3D(parAlmond::parCOO& A) {
   if (A.nnz) cnt++;
   A.nnz = cnt;
 
-  if(mesh.rank==0) printf("done.\n");
-
-  MPI_Barrier(mesh.comm);
-  free(sendNonZeros);
-  free(AsendCounts);
-  free(ArecvCounts);
-  free(AsendOffsets);
-  free(ArecvOffsets);
+  if(Comm::World().rank()==0) printf("done.\n");
 }
diff --git a/solvers/elliptic/src/ellipticBuildOperatorMatrixIpdg.cpp b/solvers/elliptic/src/ellipticBuildOperatorMatrixIpdg.cpp
index 5dc01bf1d..c4abf59a9 100644
--- a/solvers/elliptic/src/ellipticBuildOperatorMatrixIpdg.cpp
+++ b/solvers/elliptic/src/ellipticBuildOperatorMatrixIpdg.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,13 +25,18 @@ SOFTWARE.
 */
 
 #include "elliptic.hpp"
-#include "mesh/meshDefines2D.h"
-#include "mesh/meshDefines3D.h"
+
+#ifdef GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+using __gnu_parallel::sort;
+#else
+using std::sort;
+#endif
 
 void elliptic_t::BuildOperatorMatrixIpdg(parAlmond::parCOO& A){
 
   switch(mesh.elementType){
-  case TRIANGLES:
+  case Mesh::TRIANGLES:
   {
     if(mesh.dim==2)
       BuildOperatorMatrixIpdgTri2D(A);
@@ -39,25 +44,22 @@ void elliptic_t::BuildOperatorMatrixIpdg(parAlmond::parCOO& A){
       BuildOperatorMatrixIpdgTri3D(A);
     break;
   }
-  case QUADRILATERALS:{
+  case Mesh::QUADRILATERALS:{
     if(mesh.dim==2)
       BuildOperatorMatrixIpdgQuad2D(A);
     else
       BuildOperatorMatrixIpdgQuad3D(A);
     break;
   }
-  case TETRAHEDRA:
+  case Mesh::TETRAHEDRA:
     BuildOperatorMatrixIpdgTet3D(A); break;
-  case HEXAHEDRA:
+  case Mesh::HEXAHEDRA:
     BuildOperatorMatrixIpdgHex3D(A); break;
   }
-
 }
 
 void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){
 
-  int rankM = mesh.rank;
-
   int Np = mesh.Np;
   int Nfp = mesh.Nfp;
   int Nfaces = mesh.Nfaces;
@@ -67,35 +69,31 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){
   hlong Nnum = Np*Nelements;
 
   // create a global numbering system
-  hlong *globalIds = (hlong *) calloc((Nelements+mesh.totalHaloPairs)*Np,sizeof(hlong));
+  memory<hlong> globalIds((Nelements+mesh.totalHaloPairs)*Np);
 
   // every degree of freedom has its own global id
-  A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Nnum, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm);
+  A.globalRowStarts.malloc(mesh.size+1,0);
+  A.globalColStarts.malloc(mesh.size+1,0);
+  mesh.comm.Allgather(Nnum, A.globalRowStarts+1);
   for(int r=0;r<mesh.size;++r) {
     A.globalRowStarts[r+1] = A.globalRowStarts[r]+A.globalRowStarts[r+1];
     A.globalColStarts[r+1] = A.globalRowStarts[r+1];
   }
 
   /* so find number of elements on each rank */
-  dlong *rankNelements = (dlong*) calloc(mesh.size, sizeof(dlong));
-  hlong *rankStarts = (hlong*) calloc(mesh.size+1, sizeof(hlong));
-  MPI_Allgather(&Nelements, 1, MPI_DLONG,
-    rankNelements, 1, MPI_DLONG, mesh.comm);
-  //find offsets
-  for(int r=0;r<mesh.size;++r){
-    rankStarts[r+1] = rankStarts[r]+rankNelements[r];
-  }
+  hlong gNelements = Nelements;
+  hlong globalElementOffset = Nelements;
+  mesh.comm.Scan(gNelements, globalElementOffset);
+  globalElementOffset = globalElementOffset - Nelements;
   //use the offsets to set a global id
-  for (dlong e =0;e<Nelements;e++) {
+  for (dlong e=0;e<Nelements;e++) {
     for (int n=0;n<Np;n++) {
-      globalIds[e*Np +n] = n + (e + rankStarts[rankM])*Np;
+      globalIds[e*Np + n] = n + (e + globalElementOffset)*Np;
     }
   }
 
   /* do a halo exchange of global node numbers */
-  mesh.halo->Exchange(globalIds, Np, ogs_hlong);
+  mesh.halo.Exchange(globalIds, Np);
 
   dlong nnzLocalBound = Np*Np*(1+Nfaces)*Nelements;
 
@@ -103,7 +101,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){
   dfloat tol = 1e-8;
 
   // surface mass matrices MS = MM*LIFT
-  dfloat *MS = (dfloat *) calloc(Nfaces*Nfp*Nfp,sizeof(dfloat));
+  memory<dfloat> MS(Nfaces*Nfp*Nfp);
   for (int f=0;f<Nfaces;f++) {
     for (int n=0;n<Nfp;n++) {
       int fn = mesh.faceNodes[f*Nfp+n];
@@ -123,22 +121,22 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){
   // reset non-zero counter
   dlong nnz = 0;
 
-  A.entries = (parAlmond::parCOO::nonZero_t*) calloc(nnzLocalBound, sizeof(parAlmond::parCOO::nonZero_t));
+  A.entries.malloc(nnzLocalBound);
 
-  dfloat *SM = (dfloat*) calloc(Np*Np,sizeof(dfloat));
-  dfloat *SP = (dfloat*) calloc(Np*Np,sizeof(dfloat));
+  memory<dfloat> SM(Np*Np);
+  memory<dfloat> SP(Np*Np);
 
-  if(rankM==0) {printf("Building full IPDG matrix...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building full IPDG matrix...");fflush(stdout);}
 
   // loop over all elements
   for(dlong eM=0;eM<Nelements;++eM){
 
     dlong vbase = eM*mesh.Nvgeo;
-    dfloat drdx = mesh.vgeo[vbase+RXID];
-    dfloat drdy = mesh.vgeo[vbase+RYID];
-    dfloat dsdx = mesh.vgeo[vbase+SXID];
-    dfloat dsdy = mesh.vgeo[vbase+SYID];
-    dfloat J = mesh.vgeo[vbase+JID];
+    dfloat drdx = mesh.vgeo[vbase+mesh.RXID];
+    dfloat drdy = mesh.vgeo[vbase+mesh.RYID];
+    dfloat dsdx = mesh.vgeo[vbase+mesh.SXID];
+    dfloat dsdy = mesh.vgeo[vbase+mesh.SYID];
+    dfloat J = mesh.vgeo[vbase+mesh.JID];
 
     /* start with stiffness matrix  */
     for(int n=0;n<Np;++n){
@@ -160,20 +158,20 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){
 
       // load surface geofactors for this face
       dlong sid = mesh.Nsgeo*(eM*Nfaces+fM);
-      dfloat nx = mesh.sgeo[sid+NXID];
-      dfloat ny = mesh.sgeo[sid+NYID];
-      dfloat sJ = mesh.sgeo[sid+SJID];
-      dfloat hinv = mesh.sgeo[sid+IHID];
+      dfloat nx = mesh.sgeo[sid+mesh.NXID];
+      dfloat ny = mesh.sgeo[sid+mesh.NYID];
+      dfloat sJ = mesh.sgeo[sid+mesh.SJID];
+      dfloat hinv = mesh.sgeo[sid+mesh.IHID];
       dfloat penalty = tau*hinv;
 
       dlong eP = mesh.EToE[eM*Nfaces+fM];
       if (eP < 0) eP = eM;
 
       dlong vbaseP = eP*mesh.Nvgeo;
-      dfloat drdxP = mesh.vgeo[vbaseP+RXID];
-      dfloat drdyP = mesh.vgeo[vbaseP+RYID];
-      dfloat dsdxP = mesh.vgeo[vbaseP+SXID];
-      dfloat dsdyP = mesh.vgeo[vbaseP+SYID];
+      dfloat drdxP = mesh.vgeo[vbaseP+mesh.RXID];
+      dfloat drdyP = mesh.vgeo[vbaseP+mesh.RYID];
+      dfloat dsdxP = mesh.vgeo[vbaseP+mesh.SXID];
+      dfloat dsdyP = mesh.vgeo[vbaseP+mesh.SYID];
 
       int bcD = 0, bcN =0;
       int bc = mesh.EToB[fM+Nfaces*eM]; //raw boundary flag
@@ -194,7 +192,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){
       eP = mesh.EToE[eM*Nfaces+fM];
 
       // mass matrix for this face
-      dfloat *MSf = MS+fM*Nfp*Nfp;
+      memory<dfloat> MSf = MS+fM*Nfp*Nfp;
 
       // penalty term just involves face nodes
       for(int n=0;n<Nfp;++n){
@@ -264,7 +262,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){
       for(int n=0;n<Np;++n){
         for(int m=0;m<Np;++m){
           dfloat val = SP[n*Np+m];
-          if(fabs(val)>tol){
+          if(std::abs(val)>tol){
             A.entries[nnz].row = globalIds[eM*Np + n];
             A.entries[nnz].col = globalIds[eP*Np + m];
             A.entries[nnz].val = val;
@@ -277,7 +275,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){
     for(int n=0;n<Np;++n){
       for(int m=0;m<Np;++m){
         dfloat val = SM[n*Np+m];
-        if(fabs(val)>tol){
+        if(std::abs(val)>tol){
           A.entries[nnz].row = globalIds[eM*Np + n];
           A.entries[nnz].col = globalIds[eM*Np + m];
           A.entries[nnz].val = val;
@@ -289,19 +287,19 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){
 
   //printf("nnz = %d\n", nnz);
 
-  std::sort(A.entries, A.entries+nnz,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(A.entries.ptr(), A.entries.ptr()+nnz,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
 
   //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t));
   A.nnz = nnz;
 
-  if(rankM==0) printf("done.\n");
+  if(Comm::World().rank()==0) printf("done.\n");
 
 #if 0
   dfloat* Ap = (dfloat *) calloc(Np*Np*Nelements*Nelements,sizeof(dfloat));
@@ -319,17 +317,10 @@ void elliptic_t::BuildOperatorMatrixIpdgTri2D(parAlmond::parCOO& A){
     printf("\n");
   }
 #endif
-
-  free(globalIds);
-
-  free(SM); free(SP);
-  free(MS);
 }
 
 void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){
 
-  int rankM = mesh.rank;
-
   int Np = mesh.Np;
   int Nfp = mesh.Nfp;
   int Nfaces = mesh.Nfaces;
@@ -339,35 +330,31 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){
   hlong Nnum = Np*Nelements;
 
   // create a global numbering system
-  hlong *globalIds = (hlong *) calloc((Nelements+mesh.totalHaloPairs)*Np,sizeof(hlong));
+  memory<hlong> globalIds((Nelements+mesh.totalHaloPairs)*Np);
 
   // every degree of freedom has its own global id
-  A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Nnum, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm);
+  A.globalRowStarts.malloc(mesh.size+1,0);
+  A.globalColStarts.malloc(mesh.size+1,0);
+  mesh.comm.Allgather(Nnum, A.globalRowStarts+1);
   for(int r=0;r<mesh.size;++r) {
     A.globalRowStarts[r+1] = A.globalRowStarts[r]+A.globalRowStarts[r+1];
     A.globalColStarts[r+1] = A.globalRowStarts[r+1];
   }
 
   /* so find number of elements on each rank */
-  dlong *rankNelements = (dlong*) calloc(mesh.size, sizeof(dlong));
-  hlong *rankStarts = (hlong*) calloc(mesh.size+1, sizeof(hlong));
-  MPI_Allgather(&Nelements, 1, MPI_DLONG,
-    rankNelements, 1, MPI_DLONG, mesh.comm);
-  //find offsets
-  for(int r=0;r<mesh.size;++r){
-    rankStarts[r+1] = rankStarts[r]+rankNelements[r];
-  }
+  hlong gNelements = Nelements;
+  hlong globalElementOffset = Nelements;
+  mesh.comm.Scan(gNelements, globalElementOffset);
+  globalElementOffset = globalElementOffset - Nelements;
   //use the offsets to set a global id
-  for (dlong e =0;e<Nelements;e++) {
+  for (dlong e=0;e<Nelements;e++) {
     for (int n=0;n<Np;n++) {
-      globalIds[e*Np +n] = n + (e + rankStarts[rankM])*Np;
+      globalIds[e*Np + n] = n + (e + globalElementOffset)*Np;
     }
   }
 
   /* do a halo exchange of global node numbers */
-  mesh.halo->Exchange(globalIds, Np, ogs_hlong);
+  mesh.halo.Exchange(globalIds, Np);
 
   dlong nnzLocalBound = Np*Np*(1+Nfaces)*Nelements;
 
@@ -375,7 +362,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){
   dfloat tol = 1e-8;
 
   // surface mass matrices MS = MM*LIFT
-  dfloat *MS = (dfloat *) calloc(Nfaces*Nfp*Nfp,sizeof(dfloat));
+  memory<dfloat> MS(Nfaces*Nfp*Nfp);
   for (int f=0;f<Nfaces;f++) {
     for (int n=0;n<Nfp;n++) {
       int fn = mesh.faceNodes[f*Nfp+n];
@@ -395,24 +382,24 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){
   // reset non-zero counter
   dlong nnz = 0;
 
-  A.entries = (parAlmond::parCOO::nonZero_t*) calloc(nnzLocalBound, sizeof(parAlmond::parCOO::nonZero_t));
+  A.entries.malloc(nnzLocalBound);
 
-  dfloat *SM = (dfloat*) calloc(Np*Np,sizeof(dfloat));
-  dfloat *SP = (dfloat*) calloc(Np*Np,sizeof(dfloat));
+  memory<dfloat> SM(Np*Np);
+  memory<dfloat> SP(Np*Np);
 
-  if(rankM==0) {printf("Building full IPDG matrix...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building full IPDG matrix...");fflush(stdout);}
 
   // loop over all elements
   for(dlong eM=0;eM<Nelements;++eM){
 
     dlong vbase = eM*mesh.Nvgeo;
-    dfloat drdx = mesh.vgeo[vbase+RXID];
-    dfloat drdy = mesh.vgeo[vbase+RYID];
-    dfloat drdz = mesh.vgeo[vbase+RZID];
-    dfloat dsdx = mesh.vgeo[vbase+SXID];
-    dfloat dsdy = mesh.vgeo[vbase+SYID];
-    dfloat dsdz = mesh.vgeo[vbase+SZID];
-    dfloat J = mesh.vgeo[vbase+JID];
+    dfloat drdx = mesh.vgeo[vbase+mesh.RXID];
+    dfloat drdy = mesh.vgeo[vbase+mesh.RYID];
+    dfloat drdz = mesh.vgeo[vbase+mesh.RZID];
+    dfloat dsdx = mesh.vgeo[vbase+mesh.SXID];
+    dfloat dsdy = mesh.vgeo[vbase+mesh.SYID];
+    dfloat dsdz = mesh.vgeo[vbase+mesh.SZID];
+    dfloat J = mesh.vgeo[vbase+mesh.JID];
 
     /* start with stiffness matrix  */
     for(int n=0;n<Np;++n){
@@ -439,23 +426,23 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){
 
       // load surface geofactors for this face
       dlong sid = mesh.Nsgeo*(eM*Nfaces+fM);
-      dfloat nx = mesh.sgeo[sid+NXID];
-      dfloat ny = mesh.sgeo[sid+NYID];
-      dfloat nz = mesh.sgeo[sid+NZID];
-      dfloat sJ = mesh.sgeo[sid+SJID];
-      dfloat hinv = mesh.sgeo[sid+IHID];
+      dfloat nx = mesh.sgeo[sid+mesh.NXID];
+      dfloat ny = mesh.sgeo[sid+mesh.NYID];
+      dfloat nz = mesh.sgeo[sid+mesh.NZID];
+      dfloat sJ = mesh.sgeo[sid+mesh.SJID];
+      dfloat hinv = mesh.sgeo[sid+mesh.IHID];
       dfloat penalty = tau*hinv;
 
       dlong eP = mesh.EToE[eM*Nfaces+fM];
       if (eP < 0) eP = eM;
 
       dlong vbaseP = eP*mesh.Nvgeo;
-      dfloat drdxP = mesh.vgeo[vbaseP+RXID];
-      dfloat drdyP = mesh.vgeo[vbaseP+RYID];
-      dfloat drdzP = mesh.vgeo[vbaseP+RZID];
-      dfloat dsdxP = mesh.vgeo[vbaseP+SXID];
-      dfloat dsdyP = mesh.vgeo[vbaseP+SYID];
-      dfloat dsdzP = mesh.vgeo[vbaseP+SZID];
+      dfloat drdxP = mesh.vgeo[vbaseP+mesh.RXID];
+      dfloat drdyP = mesh.vgeo[vbaseP+mesh.RYID];
+      dfloat drdzP = mesh.vgeo[vbaseP+mesh.RZID];
+      dfloat dsdxP = mesh.vgeo[vbaseP+mesh.SXID];
+      dfloat dsdyP = mesh.vgeo[vbaseP+mesh.SYID];
+      dfloat dsdzP = mesh.vgeo[vbaseP+mesh.SZID];
 
       int bcD = 0, bcN =0;
       int bc = mesh.EToB[fM+Nfaces*eM]; //raw boundary flag
@@ -476,7 +463,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){
       eP = mesh.EToE[eM*Nfaces+fM];
 
       // mass matrix for this face
-      dfloat *MSf = MS+fM*Nfp*Nfp;
+      memory<dfloat> MSf = MS+fM*Nfp*Nfp;
 
       // penalty term just involves face nodes
       for(int n=0;n<Nfp;++n){
@@ -553,7 +540,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){
       for(int n=0;n<Np;++n){
         for(int m=0;m<Np;++m){
           dfloat val = SP[n*Np+m];
-          if(fabs(val)>tol){
+          if(std::abs(val)>tol){
             A.entries[nnz].row = globalIds[eM*Np + n];
             A.entries[nnz].col = globalIds[eP*Np + m];
             A.entries[nnz].val = val;
@@ -566,7 +553,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){
     for(int n=0;n<Np;++n){
       for(int m=0;m<Np;++m){
         dfloat val = SM[n*Np+m];
-        if(fabs(val)>tol){
+        if(std::abs(val)>tol){
           A.entries[nnz].row = globalIds[eM*Np + n];
           A.entries[nnz].col = globalIds[eM*Np + m];
           A.entries[nnz].val = val;
@@ -578,32 +565,25 @@ void elliptic_t::BuildOperatorMatrixIpdgTri3D(parAlmond::parCOO& A){
 
   //printf("nnz = %d\n", nnz);
 
-  std::sort(A.entries, A.entries+nnz,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(A.entries.ptr(), A.entries.ptr()+nnz,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
 
   //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t));
   A.nnz = nnz;
 
-  if(rankM==0) printf("done.\n");
-
-  free(globalIds);
-
-  free(SM); free(SP);
-  free(MS);
+  if(Comm::World().rank()==0) printf("done.\n");
 }
 
 
 
 void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){
 
-  int rankM = mesh.rank;
-
   int Np = mesh.Np;
   int Nfaces = mesh.Nfaces;
   dlong Nelements = mesh.Nelements;
@@ -611,35 +591,31 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){
   hlong Nnum = mesh.Np*mesh.Nelements;
 
   // create a global numbering system
-  hlong *globalIds = (hlong *) calloc((Nelements+mesh.totalHaloPairs)*Np,sizeof(hlong));
+  memory<hlong> globalIds((Nelements+mesh.totalHaloPairs)*Np);
 
   // every degree of freedom has its own global id
-  A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Nnum, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm);
+  A.globalRowStarts.malloc(mesh.size+1,0);
+  A.globalColStarts.malloc(mesh.size+1,0);
+  mesh.comm.Allgather(Nnum, A.globalRowStarts+1);
   for(int r=0;r<mesh.size;++r) {
     A.globalRowStarts[r+1] = A.globalRowStarts[r]+A.globalRowStarts[r+1];
     A.globalColStarts[r+1] = A.globalRowStarts[r+1];
   }
 
   /* so find number of elements on each rank */
-  dlong *rankNelements = (dlong*) calloc(mesh.size, sizeof(dlong));
-  hlong *rankStarts = (hlong*) calloc(mesh.size+1, sizeof(hlong));
-  MPI_Allgather(&Nelements, 1, MPI_DLONG,
-    rankNelements, 1, MPI_DLONG, mesh.comm);
-  //find offsets
-  for(int r=0;r<mesh.size;++r){
-    rankStarts[r+1] = rankStarts[r]+rankNelements[r];
-  }
+  hlong gNelements = Nelements;
+  hlong globalElementOffset = Nelements;
+  mesh.comm.Scan(gNelements, globalElementOffset);
+  globalElementOffset = globalElementOffset - Nelements;
   //use the offsets to set a global id
-  for (dlong e =0;e<Nelements;e++) {
+  for (dlong e=0;e<Nelements;e++) {
     for (int n=0;n<Np;n++) {
-      globalIds[e*Np +n] = n + (e + rankStarts[rankM])*Np;
+      globalIds[e*Np + n] = n + (e + globalElementOffset)*Np;
     }
   }
 
   /* do a halo exchange of global node numbers */
-  mesh.halo->Exchange(globalIds, Np, ogs_hlong);
+  mesh.halo.Exchange(globalIds, Np);
 
   dlong nnzLocalBound = Np*Np*(1+Nfaces)*Nelements;
 
@@ -647,9 +623,9 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){
   dfloat tol = 1e-8;
 
   // build some monolithic basis arrays (use Dr,Ds,Dt and insert MM instead of weights for tet version)
-  dfloat *B  = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
+  memory<dfloat> B (mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Br(mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Bs(mesh.Np*mesh.Np, 0.0);
 
   int mode = 0;
   for(int nj=0;nj<mesh.N+1;++nj){
@@ -675,9 +651,9 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){
     }
   }
 
-  A.entries = (parAlmond::parCOO::nonZero_t*) calloc(nnzLocalBound,sizeof(parAlmond::parCOO::nonZero_t));
+  A.entries.malloc(nnzLocalBound);
 
-  if(rankM==0) {printf("Building full IPDG matrix...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building full IPDG matrix...");fflush(stdout);}
 
   // reset non-zero counter
   dlong nnz = 0;
@@ -693,11 +669,11 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){
         // (grad phi_n, grad phi_m)_{D^e}
         for(int i=0;i<mesh.Np;++i){
           dlong base = eM*mesh.Np*mesh.Nvgeo + i;
-          dfloat drdx = mesh.vgeo[base+mesh.Np*RXID];
-          dfloat drdy = mesh.vgeo[base+mesh.Np*RYID];
-          dfloat dsdx = mesh.vgeo[base+mesh.Np*SXID];
-          dfloat dsdy = mesh.vgeo[base+mesh.Np*SYID];
-          dfloat JW   = mesh.vgeo[base+mesh.Np*JWID];
+          dfloat drdx = mesh.vgeo[base+mesh.Np*mesh.RXID];
+          dfloat drdy = mesh.vgeo[base+mesh.Np*mesh.RYID];
+          dfloat dsdx = mesh.vgeo[base+mesh.Np*mesh.SXID];
+          dfloat dsdy = mesh.vgeo[base+mesh.Np*mesh.SYID];
+          dfloat JW   = mesh.vgeo[base+mesh.Np*mesh.JWID];
 
           int idn = n*mesh.Np+i;
           int idm = m*mesh.Np+i;
@@ -718,27 +694,27 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){
 
             // grab vol geofacs at surface nodes
             dlong baseM = eM*mesh.Np*mesh.Nvgeo + vidM;
-            dfloat drdxM = mesh.vgeo[baseM+mesh.Np*RXID];
-            dfloat drdyM = mesh.vgeo[baseM+mesh.Np*RYID];
-            dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*SXID];
-            dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*SYID];
+            dfloat drdxM = mesh.vgeo[baseM+mesh.Np*mesh.RXID];
+            dfloat drdyM = mesh.vgeo[baseM+mesh.Np*mesh.RYID];
+            dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*mesh.SXID];
+            dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*mesh.SYID];
 
             // double check vol geometric factors are in halo storage of vgeo
             dlong idM     = eM*mesh.Nfp*mesh.Nfaces+fM*mesh.Nfp+i;
             int vidP      = (int) (mesh.vmapP[idM]%mesh.Np); // only use this to identify location of positive trace vgeo
             dlong localEP = mesh.vmapP[idM]/mesh.Np;
             dlong baseP   = localEP*mesh.Np*mesh.Nvgeo + vidP; // use local offset for vgeo in halo
-            dfloat drdxP = mesh.vgeo[baseP+mesh.Np*RXID];
-            dfloat drdyP = mesh.vgeo[baseP+mesh.Np*RYID];
-            dfloat dsdxP = mesh.vgeo[baseP+mesh.Np*SXID];
-            dfloat dsdyP = mesh.vgeo[baseP+mesh.Np*SYID];
+            dfloat drdxP = mesh.vgeo[baseP+mesh.Np*mesh.RXID];
+            dfloat drdyP = mesh.vgeo[baseP+mesh.Np*mesh.RYID];
+            dfloat dsdxP = mesh.vgeo[baseP+mesh.Np*mesh.SXID];
+            dfloat dsdyP = mesh.vgeo[baseP+mesh.Np*mesh.SYID];
 
             // grab surface geometric factors
             dlong base = mesh.Nsgeo*(eM*mesh.Nfp*mesh.Nfaces + fM*mesh.Nfp + i);
-            dfloat nx = mesh.sgeo[base+NXID];
-            dfloat ny = mesh.sgeo[base+NYID];
-            dfloat wsJ = mesh.sgeo[base+WSJID];
-            dfloat hinv = mesh.sgeo[base+IHID];
+            dfloat nx = mesh.sgeo[base+mesh.NXID];
+            dfloat ny = mesh.sgeo[base+mesh.NYID];
+            dfloat wsJ = mesh.sgeo[base+mesh.WSJID];
+            dfloat hinv = mesh.sgeo[base+mesh.IHID];
 
             // form negative trace terms in IPDG
             int idnM = n*mesh.Np+vidM;
@@ -791,7 +767,7 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){
               AnmP += -0.5*wsJ*penalty*lnM*lmP; // -((tau/h)*ln^-,lm^+)
             }
           }
-          if(fabs(AnmP)>tol){
+          if(std::abs(AnmP)>tol){
             // remote info
             dlong eP    = mesh.EToE[eM*mesh.Nfaces+fM];
             A.entries[nnz].row = globalIds[eM*mesh.Np + n];
@@ -800,7 +776,7 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){
             ++nnz;
           }
         }
-        if(fabs(Anm)>tol){
+        if(std::abs(Anm)>tol){
           // local block
           A.entries[nnz].row = globalIds[eM*mesh.Np+n];
           A.entries[nnz].col = globalIds[eM*mesh.Np+m];
@@ -812,29 +788,24 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad2D(parAlmond::parCOO& A){
   }
 
   // sort received non-zero entries by row block
-  std::sort(A.entries, A.entries+nnz,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(A.entries.ptr(), A.entries.ptr()+nnz,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
 
   //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t));
   A.nnz = nnz;
 
-  if(rankM==0) printf("done.\n");
-
-  free(globalIds);
-  free(B);  free(Br); free(Bs);
+  if(Comm::World().rank()==0) printf("done.\n");
 }
 
 
 void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
 
-  int rankM = mesh.rank;
-
   int Np = mesh.Np;
   int Nfaces = mesh.Nfaces;
   dlong Nelements = mesh.Nelements;
@@ -842,34 +813,31 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
   hlong Nnum = mesh.Np*mesh.Nelements;
 
   // create a global numbering system
-  hlong *globalIds = (hlong *) calloc((Nelements+mesh.totalHaloPairs)*Np,sizeof(hlong));
+  memory<hlong> globalIds((Nelements+mesh.totalHaloPairs)*Np);
 
   // every degree of freedom has its own global id
-  A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Nnum, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm);
+  A.globalRowStarts.malloc(mesh.size+1,0);
+  A.globalColStarts.malloc(mesh.size+1,0);
+  mesh.comm.Allgather(Nnum, A.globalRowStarts+1);
   for(int r=0;r<mesh.size;++r) {
     A.globalRowStarts[r+1] = A.globalRowStarts[r]+A.globalRowStarts[r+1];
     A.globalColStarts[r+1] = A.globalRowStarts[r+1];
   }
 
   /* so find number of elements on each rank */
-  dlong *rankNelements = (dlong*) calloc(mesh.size, sizeof(dlong));
-  hlong *rankStarts = (hlong*) calloc(mesh.size+1, sizeof(hlong));
-  MPI_Allgather(&Nelements, 1, MPI_DLONG, rankNelements, 1, MPI_DLONG, mesh.comm);
-  //find offsets
-  for(int r=0;r<mesh.size;++r){
-    rankStarts[r+1] = rankStarts[r]+rankNelements[r];
-  }
+  hlong gNelements = Nelements;
+  hlong globalElementOffset = Nelements;
+  mesh.comm.Scan(gNelements, globalElementOffset);
+  globalElementOffset = globalElementOffset - Nelements;
   //use the offsets to set a global id
-  for (dlong e =0;e<Nelements;e++) {
+  for (dlong e=0;e<Nelements;e++) {
     for (int n=0;n<Np;n++) {
-      globalIds[e*Np +n] = n + (e + rankStarts[rankM])*Np;
+      globalIds[e*Np + n] = n + (e + globalElementOffset)*Np;
     }
   }
 
   /* do a halo exchange of global node numbers */
-  mesh.halo->Exchange(globalIds, Np, ogs_hlong);
+  mesh.halo.Exchange(globalIds, Np);
 
   dlong nnzLocalBound = Np*Np*(1+Nfaces)*Nelements;
 
@@ -877,9 +845,9 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
   dfloat tol = 1e-8;
 
   // build some monolithic basis arrays (use Dr,Ds,Dt and insert MM instead of weights for tet version)
-  dfloat *B  = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
+  memory<dfloat> B (mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Br(mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Bs(mesh.Np*mesh.Np, 0.0);
 
   int mode = 0;
   for(int nj=0;nj<mesh.N+1;++nj){
@@ -905,9 +873,9 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
     }
   }
 
-  A.entries = (parAlmond::parCOO::nonZero_t*) calloc(nnzLocalBound,sizeof(parAlmond::parCOO::nonZero_t));
+  A.entries.malloc(nnzLocalBound);
 
-  if(rankM==0) {printf("Building full IPDG matrix...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building full IPDG matrix...");fflush(stdout);}
 
   // reset non-zero counter
   dlong nnz = 0;
@@ -923,16 +891,16 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
         // (grad phi_n, grad phi_m)_{D^e}
         for(int i=0;i<mesh.Np;++i){
           dlong base = eM*mesh.Np*mesh.Nvgeo + i;
-          dfloat drdx = mesh.vgeo[base+mesh.Np*RXID];
-          dfloat drdy = mesh.vgeo[base+mesh.Np*RYID];
-          dfloat drdz = mesh.vgeo[base+mesh.Np*RZID];
-          dfloat dsdx = mesh.vgeo[base+mesh.Np*SXID];
-          dfloat dsdy = mesh.vgeo[base+mesh.Np*SYID];
-          dfloat dsdz = mesh.vgeo[base+mesh.Np*SZID];
-          // dfloat dtdx = mesh.vgeo[base+mesh.Np*TXID];
-          // dfloat dtdy = mesh.vgeo[base+mesh.Np*TYID];
-          // dfloat dtdz = mesh.vgeo[base+mesh.Np*TZID];
-          dfloat JW   = mesh.vgeo[base+mesh.Np*JWID];
+          dfloat drdx = mesh.vgeo[base+mesh.Np*mesh.RXID];
+          dfloat drdy = mesh.vgeo[base+mesh.Np*mesh.RYID];
+          dfloat drdz = mesh.vgeo[base+mesh.Np*mesh.RZID];
+          dfloat dsdx = mesh.vgeo[base+mesh.Np*mesh.SXID];
+          dfloat dsdy = mesh.vgeo[base+mesh.Np*mesh.SYID];
+          dfloat dsdz = mesh.vgeo[base+mesh.Np*mesh.SZID];
+          // dfloat dtdx = mesh.vgeo[base+mesh.Np*mesh.TXID];
+          // dfloat dtdy = mesh.vgeo[base+mesh.Np*mesh.TYID];
+          // dfloat dtdz = mesh.vgeo[base+mesh.Np*mesh.TZID];
+          dfloat JW   = mesh.vgeo[base+mesh.Np*mesh.JWID];
 
           int idn = n*mesh.Np+i;
           int idm = m*mesh.Np+i;
@@ -957,17 +925,17 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
 
             // grab vol geofacs at surface nodes
             dlong baseM = eM*mesh.Np*mesh.Nvgeo + vidM;
-            dfloat drdxM = mesh.vgeo[baseM+mesh.Np*RXID];
-            dfloat drdyM = mesh.vgeo[baseM+mesh.Np*RYID];
-            dfloat drdzM = mesh.vgeo[baseM+mesh.Np*RZID];
+            dfloat drdxM = mesh.vgeo[baseM+mesh.Np*mesh.RXID];
+            dfloat drdyM = mesh.vgeo[baseM+mesh.Np*mesh.RYID];
+            dfloat drdzM = mesh.vgeo[baseM+mesh.Np*mesh.RZID];
 
-            dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*SXID];
-            dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*SYID];
-            dfloat dsdzM = mesh.vgeo[baseM+mesh.Np*SZID];
+            dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*mesh.SXID];
+            dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*mesh.SYID];
+            dfloat dsdzM = mesh.vgeo[baseM+mesh.Np*mesh.SZID];
 
-            // dfloat dtdxM = mesh.vgeo[baseM+mesh.Np*TXID];
-            // dfloat dtdyM = mesh.vgeo[baseM+mesh.Np*TYID];
-            // dfloat dtdzM = mesh.vgeo[baseM+mesh.Np*TZID];
+            // dfloat dtdxM = mesh.vgeo[baseM+mesh.Np*mesh.TXID];
+            // dfloat dtdyM = mesh.vgeo[baseM+mesh.Np*mesh.TYID];
+            // dfloat dtdzM = mesh.vgeo[baseM+mesh.Np*mesh.TZID];
 
 
             // double check vol geometric factors are in halo storage of vgeo
@@ -975,25 +943,25 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
             int vidP      = (int) (mesh.vmapP[idM]%mesh.Np); // only use this to identify location of positive trace vgeo
             dlong localEP = mesh.vmapP[idM]/mesh.Np;
             dlong baseP   = localEP*mesh.Np*mesh.Nvgeo + vidP; // use local offset for vgeo in halo
-            dfloat drdxP = mesh.vgeo[baseP+mesh.Np*RXID];
-            dfloat drdyP = mesh.vgeo[baseP+mesh.Np*RYID];
-            dfloat drdzP = mesh.vgeo[baseP+mesh.Np*RZID];
+            dfloat drdxP = mesh.vgeo[baseP+mesh.Np*mesh.RXID];
+            dfloat drdyP = mesh.vgeo[baseP+mesh.Np*mesh.RYID];
+            dfloat drdzP = mesh.vgeo[baseP+mesh.Np*mesh.RZID];
 
-            dfloat dsdxP = mesh.vgeo[baseP+mesh.Np*SXID];
-            dfloat dsdyP = mesh.vgeo[baseP+mesh.Np*SYID];
-            dfloat dsdzP = mesh.vgeo[baseP+mesh.Np*SZID];
+            dfloat dsdxP = mesh.vgeo[baseP+mesh.Np*mesh.SXID];
+            dfloat dsdyP = mesh.vgeo[baseP+mesh.Np*mesh.SYID];
+            dfloat dsdzP = mesh.vgeo[baseP+mesh.Np*mesh.SZID];
 
-            // dfloat dtdxP = mesh.vgeo[baseP+mesh.Np*TXID];
-            // dfloat dtdyP = mesh.vgeo[baseP+mesh.Np*TYID];
-            // dfloat dtdzP = mesh.vgeo[baseP+mesh.Np*TZID];
+            // dfloat dtdxP = mesh.vgeo[baseP+mesh.Np*mesh.TXID];
+            // dfloat dtdyP = mesh.vgeo[baseP+mesh.Np*mesh.TYID];
+            // dfloat dtdzP = mesh.vgeo[baseP+mesh.Np*mesh.TZID];
 
             // grab surface geometric factors
             dlong base = mesh.Nsgeo*(eM*mesh.Nfp*mesh.Nfaces + fM*mesh.Nfp + i);
-            dfloat nx = mesh.sgeo[base+NXID];
-            dfloat ny = mesh.sgeo[base+NYID];
-            dfloat nz = mesh.sgeo[base+NZID];
-            dfloat wsJ = mesh.sgeo[base+WSJID];
-            dfloat hinv = mesh.sgeo[base+IHID];
+            dfloat nx = mesh.sgeo[base+mesh.NXID];
+            dfloat ny = mesh.sgeo[base+mesh.NYID];
+            dfloat nz = mesh.sgeo[base+mesh.NZID];
+            dfloat wsJ = mesh.sgeo[base+mesh.WSJID];
+            dfloat hinv = mesh.sgeo[base+mesh.IHID];
 
             // form negative trace terms in IPDG
             int idnM = n*mesh.Np+vidM;
@@ -1029,7 +997,7 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
             AnmP += +0.5*wsJ*ndotgradlnM*lmP;  // +(N.grad ln^-, lm^+)
             AnmP += -0.5*wsJ*penalty*lnM*lmP; // -((tau/h)*ln^-,lm^+)
           }
-          if(fabs(AnmP)>tol){
+          if(std::abs(AnmP)>tol){
             // remote info
             dlong eP    = mesh.EToE[eM*mesh.Nfaces+fM];
             A.entries[nnz].row = globalIds[eM*mesh.Np + n];
@@ -1039,7 +1007,7 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
           }
         }
 
-        if(fabs(Anm)>tol){
+        if(std::abs(Anm)>tol){
           // local block
           A.entries[nnz].row = globalIds[eM*mesh.Np+n];
           A.entries[nnz].col = globalIds[eM*mesh.Np+m];
@@ -1051,19 +1019,19 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
   }
 
   // sort received non-zero entries by row block
-  std::sort(A.entries, A.entries+nnz,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(A.entries.ptr(), A.entries.ptr()+nnz,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
 
   //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t));
   A.nnz = nnz;
 
-  if(rankM==0) printf("done.\n");
+  if(Comm::World().rank()==0) printf("done.\n");
 
 #if 0
   {
@@ -1077,9 +1045,6 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
     fclose(fp);
   }
 #endif
-
-  free(globalIds);
-  free(B);  free(Br); free(Bs);
 }
 
 
@@ -1089,41 +1054,35 @@ void elliptic_t::BuildOperatorMatrixIpdgQuad3D(parAlmond::parCOO& A){
 
 void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){
 
-  int rankM = mesh.rank;
-
   // number of degrees of freedom on this rank
   hlong Nnum = mesh.Np*mesh.Nelements;
 
   // create a global numbering system
-  hlong *globalIds = (hlong *) calloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np,sizeof(hlong));
+  memory<hlong> globalIds((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np);
 
   // every degree of freedom has its own global id
-  A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Nnum, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm);
+  A.globalRowStarts.malloc(mesh.size+1,0);
+  A.globalColStarts.malloc(mesh.size+1,0);
+  mesh.comm.Allgather(Nnum, A.globalRowStarts+1);
   for(int r=0;r<mesh.size;++r) {
     A.globalRowStarts[r+1] = A.globalRowStarts[r]+A.globalRowStarts[r+1];
     A.globalColStarts[r+1] = A.globalRowStarts[r+1];
   }
 
   /* so find number of elements on each rank */
-  dlong *rankNelements = (dlong*) calloc(mesh.size, sizeof(dlong));
-  hlong *rankStarts = (hlong*) calloc(mesh.size+1, sizeof(hlong));
-  MPI_Allgather(&(mesh.Nelements), 1, MPI_DLONG,
-                    rankNelements, 1, MPI_DLONG, mesh.comm);
-  //find offsets
-  for(int r=0;r<mesh.size;++r){
-    rankStarts[r+1] = rankStarts[r]+rankNelements[r];
-  }
+  hlong gNelements = mesh.Nelements;
+  hlong globalElementOffset = mesh.Nelements;
+  mesh.comm.Scan(gNelements, globalElementOffset);
+  globalElementOffset = globalElementOffset - mesh.Nelements;
   //use the offsets to set a global id
-  for (dlong e =0;e<mesh.Nelements;e++) {
+  for (dlong e=0;e<mesh.Nelements;e++) {
     for (int n=0;n<mesh.Np;n++) {
-      globalIds[e*mesh.Np +n] = n + (e + rankStarts[rankM])*mesh.Np;
+      globalIds[e*mesh.Np + n] = n + (e + globalElementOffset)*mesh.Np;
     }
   }
 
   /* do a halo exchange of global node numbers */
-  mesh.halo->Exchange(globalIds, mesh.Np, ogs_hlong);
+  mesh.halo.Exchange(globalIds, mesh.Np);
 
   dlong nnzLocalBound = mesh.Np*mesh.Np*(1+mesh.Nfaces)*mesh.Nelements;
 
@@ -1131,7 +1090,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){
   dfloat tol = 1e-8;
 
   // surface mass matrices MS = MM*LIFT
-  dfloat *MS = (dfloat *) calloc(mesh.Nfaces*mesh.Np*mesh.Nfp,sizeof(dfloat));
+  memory<dfloat> MS(mesh.Nfaces*mesh.Np*mesh.Nfp);
   for (int f=0;f<mesh.Nfaces;f++) {
     for (int n=0;n<mesh.Np;n++) {
       for (int m=0;m<mesh.Nfp;m++) {
@@ -1145,9 +1104,9 @@ void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){
   }
 
   // DrT*MS, DsT*MS, DtT*MS
-  dfloat *DrTMS = (dfloat *) calloc(mesh.Nfaces*mesh.Np*mesh.Nfp,sizeof(dfloat));
-  dfloat *DsTMS = (dfloat *) calloc(mesh.Nfaces*mesh.Np*mesh.Nfp,sizeof(dfloat));
-  dfloat *DtTMS = (dfloat *) calloc(mesh.Nfaces*mesh.Np*mesh.Nfp,sizeof(dfloat));
+  memory<dfloat> DrTMS(mesh.Nfaces*mesh.Np*mesh.Nfp);
+  memory<dfloat> DsTMS(mesh.Nfaces*mesh.Np*mesh.Nfp);
+  memory<dfloat> DtTMS(mesh.Nfaces*mesh.Np*mesh.Nfp);
   for (int f=0;f<mesh.Nfaces;f++) {
     for (int n=0;n<mesh.Np;n++) {
       for (int i=0;i<mesh.Nfp;i++) {
@@ -1166,35 +1125,35 @@ void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){
     }
   }
 
-  A.entries = (parAlmond::parCOO::nonZero_t*) calloc(nnzLocalBound,sizeof(parAlmond::parCOO::nonZero_t));
+  A.entries.malloc(nnzLocalBound);
 
   // reset non-zero counter
   dlong nnz = 0;
 
-  if(rankM==0) {printf("Building full IPDG matrix...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building full IPDG matrix...");fflush(stdout);}
 
   // loop over all elements
   //#pragma omp parallel
 {
 
-  dfloat *BM = (dfloat *) calloc(mesh.Np*mesh.Np,sizeof(dfloat));
+  memory<dfloat> BM(mesh.Np*mesh.Np);
 
-  dfloat *qmP = (dfloat *) calloc(mesh.Nfp,sizeof(dfloat));
-  dfloat *qmM = (dfloat *) calloc(mesh.Nfp,sizeof(dfloat));
-  dfloat *ndotgradqmM = (dfloat *) calloc(mesh.Nfp,sizeof(dfloat));
-  dfloat *ndotgradqmP = (dfloat *) calloc(mesh.Nfp,sizeof(dfloat));
+  memory<dfloat> qmP(mesh.Nfp);
+  memory<dfloat> qmM(mesh.Nfp);
+  memory<dfloat> ndotgradqmM(mesh.Nfp);
+  memory<dfloat> ndotgradqmP(mesh.Nfp);
 
   //#pragma omp for
   for(dlong eM=0;eM<mesh.Nelements;++eM){
 
     dlong gbase = eM*mesh.Nggeo;
-    dfloat Grr = mesh.ggeo[gbase+G00ID];
-    dfloat Grs = mesh.ggeo[gbase+G01ID];
-    dfloat Grt = mesh.ggeo[gbase+G02ID];
-    dfloat Gss = mesh.ggeo[gbase+G11ID];
-    dfloat Gst = mesh.ggeo[gbase+G12ID];
-    dfloat Gtt = mesh.ggeo[gbase+G22ID];
-    dfloat J   = mesh.ggeo[gbase+GWJID];
+    dfloat Grr = mesh.ggeo[gbase+mesh.G00ID];
+    dfloat Grs = mesh.ggeo[gbase+mesh.G01ID];
+    dfloat Grt = mesh.ggeo[gbase+mesh.G02ID];
+    dfloat Gss = mesh.ggeo[gbase+mesh.G11ID];
+    dfloat Gst = mesh.ggeo[gbase+mesh.G12ID];
+    dfloat Gtt = mesh.ggeo[gbase+mesh.G22ID];
+    dfloat J   = mesh.wJ[eM];
 
     /* start with stiffness matrix  */
     for(int n=0;n<mesh.Np;++n){
@@ -1210,38 +1169,38 @@ void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){
     }
 
     dlong vbase = eM*mesh.Nvgeo;
-    dfloat drdx = mesh.vgeo[vbase+RXID];
-    dfloat drdy = mesh.vgeo[vbase+RYID];
-    dfloat drdz = mesh.vgeo[vbase+RZID];
-    dfloat dsdx = mesh.vgeo[vbase+SXID];
-    dfloat dsdy = mesh.vgeo[vbase+SYID];
-    dfloat dsdz = mesh.vgeo[vbase+SZID];
-    dfloat dtdx = mesh.vgeo[vbase+TXID];
-    dfloat dtdy = mesh.vgeo[vbase+TYID];
-    dfloat dtdz = mesh.vgeo[vbase+TZID];
+    dfloat drdx = mesh.vgeo[vbase+mesh.RXID];
+    dfloat drdy = mesh.vgeo[vbase+mesh.RYID];
+    dfloat drdz = mesh.vgeo[vbase+mesh.RZID];
+    dfloat dsdx = mesh.vgeo[vbase+mesh.SXID];
+    dfloat dsdy = mesh.vgeo[vbase+mesh.SYID];
+    dfloat dsdz = mesh.vgeo[vbase+mesh.SZID];
+    dfloat dtdx = mesh.vgeo[vbase+mesh.TXID];
+    dfloat dtdy = mesh.vgeo[vbase+mesh.TYID];
+    dfloat dtdz = mesh.vgeo[vbase+mesh.TZID];
 
     for (int m=0;m<mesh.Np;m++) {
       for (int fM=0;fM<mesh.Nfaces;fM++) {
         // load surface geofactors for this face
         dlong sid = mesh.Nsgeo*(eM*mesh.Nfaces+fM);
-        dfloat nx = mesh.sgeo[sid+NXID];
-        dfloat ny = mesh.sgeo[sid+NYID];
-        dfloat nz = mesh.sgeo[sid+NZID];
-        dfloat sJ = mesh.sgeo[sid+SJID];
-        dfloat hinv = mesh.sgeo[sid+IHID];
+        dfloat nx = mesh.sgeo[sid+mesh.NXID];
+        dfloat ny = mesh.sgeo[sid+mesh.NYID];
+        dfloat nz = mesh.sgeo[sid+mesh.NZID];
+        dfloat sJ = mesh.sgeo[sid+mesh.SJID];
+        dfloat hinv = mesh.sgeo[sid+mesh.IHID];
 
         dlong eP = mesh.EToE[eM*mesh.Nfaces+fM];
         if (eP < 0) eP = eM;
         dlong vbaseP = eP*mesh.Nvgeo;
-        dfloat drdxP = mesh.vgeo[vbaseP+RXID];
-        dfloat drdyP = mesh.vgeo[vbaseP+RYID];
-        dfloat drdzP = mesh.vgeo[vbaseP+RZID];
-        dfloat dsdxP = mesh.vgeo[vbaseP+SXID];
-        dfloat dsdyP = mesh.vgeo[vbaseP+SYID];
-        dfloat dsdzP = mesh.vgeo[vbaseP+SZID];
-        dfloat dtdxP = mesh.vgeo[vbaseP+TXID];
-        dfloat dtdyP = mesh.vgeo[vbaseP+TYID];
-        dfloat dtdzP = mesh.vgeo[vbaseP+TZID];
+        dfloat drdxP = mesh.vgeo[vbaseP+mesh.RXID];
+        dfloat drdyP = mesh.vgeo[vbaseP+mesh.RYID];
+        dfloat drdzP = mesh.vgeo[vbaseP+mesh.RZID];
+        dfloat dsdxP = mesh.vgeo[vbaseP+mesh.SXID];
+        dfloat dsdyP = mesh.vgeo[vbaseP+mesh.SYID];
+        dfloat dsdzP = mesh.vgeo[vbaseP+mesh.SZID];
+        dfloat dtdxP = mesh.vgeo[vbaseP+mesh.TXID];
+        dfloat dtdyP = mesh.vgeo[vbaseP+mesh.TYID];
+        dfloat dtdzP = mesh.vgeo[vbaseP+mesh.TZID];
 
         // extract trace nodes
         for (int i=0;i<mesh.Nfp;i++) {
@@ -1307,7 +1266,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){
             }
           }
 
-          if(fabs(AnmP)>tol){
+          if(std::abs(AnmP)>tol){
             //#pragma omp critical
             {
               // remote info
@@ -1325,7 +1284,7 @@ void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){
       for (int m=0;m<mesh.Np;m++) {
         dfloat Anm = BM[m+n*mesh.Np];
 
-        if(fabs(Anm)>tol){
+        if(std::abs(Anm)>tol){
           //#pragma omp critical
           {
             A.entries[nnz].row = globalIds[eM*mesh.Np+n];
@@ -1337,36 +1296,25 @@ void elliptic_t::BuildOperatorMatrixIpdgTet3D(parAlmond::parCOO& A){
       }
     }
   }
-
-  free(BM);
-  free(qmM); free(qmP);
-  free(ndotgradqmM); free(ndotgradqmP);
 }
 
-  std::sort(A.entries, A.entries+nnz,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(A.entries.ptr(), A.entries.ptr()+nnz,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
   // free up unused storage
   //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t));
   A.nnz = nnz;
 
-  if(rankM==0) printf("done.\n");
-
-  free(globalIds);
-
-  free(MS);
-  free(DrTMS); free(DsTMS); free(DtTMS);
+  if(Comm::World().rank()==0) printf("done.\n");
 }
 
 void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){
 
-  int rankM = mesh.rank;
-
   int Np = mesh.Np;
   int Nfaces = mesh.Nfaces;
   dlong Nelements = mesh.Nelements;
@@ -1374,35 +1322,31 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){
   hlong Nnum = mesh.Np*mesh.Nelements;
 
   // create a global numbering system
-  hlong *globalIds = (hlong *) calloc((Nelements+mesh.totalHaloPairs)*Np,sizeof(hlong));
+  memory<hlong> globalIds((Nelements+mesh.totalHaloPairs)*Np);
 
   // every degree of freedom has its own global id
-  A.globalRowStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  A.globalColStarts = (hlong*) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Nnum, 1, MPI_HLONG, A.globalRowStarts+1, 1, MPI_HLONG, mesh.comm);
+  A.globalRowStarts.malloc(mesh.size+1,0);
+  A.globalColStarts.malloc(mesh.size+1,0);
+  mesh.comm.Allgather(Nnum, A.globalRowStarts+1);
   for(int r=0;r<mesh.size;++r) {
     A.globalRowStarts[r+1] = A.globalRowStarts[r]+A.globalRowStarts[r+1];
     A.globalColStarts[r+1] = A.globalRowStarts[r+1];
   }
 
   /* so find number of elements on each rank */
-  dlong *rankNelements = (dlong*) calloc(mesh.size, sizeof(dlong));
-  hlong *rankStarts = (hlong*) calloc(mesh.size+1, sizeof(hlong));
-  MPI_Allgather(&Nelements, 1, MPI_DLONG,
-    rankNelements, 1, MPI_DLONG, mesh.comm);
-  //find offsets
-  for(int r=0;r<mesh.size;++r){
-    rankStarts[r+1] = rankStarts[r]+rankNelements[r];
-  }
+  hlong gNelements = Nelements;
+  hlong globalElementOffset = Nelements;
+  mesh.comm.Scan(gNelements, globalElementOffset);
+  globalElementOffset = globalElementOffset - Nelements;
   //use the offsets to set a global id
-  for (dlong e =0;e<Nelements;e++) {
+  for (dlong e=0;e<Nelements;e++) {
     for (int n=0;n<Np;n++) {
-      globalIds[e*Np +n] = n + (e + rankStarts[rankM])*Np;
+      globalIds[e*Np + n] = n + (e + globalElementOffset)*Np;
     }
   }
 
   /* do a halo exchange of global node numbers */
-  mesh.halo->Exchange(globalIds, Np, ogs_hlong);
+  mesh.halo.Exchange(globalIds, Np);
 
   dlong nnzLocalBound = Np*Np*(1+Nfaces)*Nelements;
 
@@ -1410,10 +1354,10 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){
   dfloat tol = 1e-8;
 
   // build some monolithic basis arrays (use Dr,Ds,Dt and insert MM instead of weights for tet version)
-  dfloat *B  = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Br = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Bs = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
-  dfloat *Bt = (dfloat*) calloc(mesh.Np*mesh.Np, sizeof(dfloat));
+  memory<dfloat> B (mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Br(mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Bs(mesh.Np*mesh.Np, 0.0);
+  memory<dfloat> Bt(mesh.Np*mesh.Np, 0.0);
 
   int mode = 0;
   for(int nk=0;nk<mesh.N+1;++nk){
@@ -1445,9 +1389,9 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){
     }
   }
 
-  A.entries = (parAlmond::parCOO::nonZero_t*) calloc(nnzLocalBound,sizeof(parAlmond::parCOO::nonZero_t));
+  A.entries.malloc(nnzLocalBound);
 
-  if(rankM==0) {printf("Building full IPDG matrix...");fflush(stdout);}
+  if(Comm::World().rank()==0) {printf("Building full IPDG matrix...");fflush(stdout);}
 
   // reset non-zero counter
   dlong nnz = 0;
@@ -1464,16 +1408,16 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){
         // (grad phi_n, grad phi_m)_{D^e}
         for(int i=0;i<mesh.Np;++i){
           dlong base = eM*mesh.Np*mesh.Nvgeo + i;
-          dfloat drdx = mesh.vgeo[base+mesh.Np*RXID];
-          dfloat drdy = mesh.vgeo[base+mesh.Np*RYID];
-          dfloat drdz = mesh.vgeo[base+mesh.Np*RZID];
-          dfloat dsdx = mesh.vgeo[base+mesh.Np*SXID];
-          dfloat dsdy = mesh.vgeo[base+mesh.Np*SYID];
-          dfloat dsdz = mesh.vgeo[base+mesh.Np*SZID];
-          dfloat dtdx = mesh.vgeo[base+mesh.Np*TXID];
-          dfloat dtdy = mesh.vgeo[base+mesh.Np*TYID];
-          dfloat dtdz = mesh.vgeo[base+mesh.Np*TZID];
-          dfloat JW   = mesh.vgeo[base+mesh.Np*JWID];
+          dfloat drdx = mesh.vgeo[base+mesh.Np*mesh.RXID];
+          dfloat drdy = mesh.vgeo[base+mesh.Np*mesh.RYID];
+          dfloat drdz = mesh.vgeo[base+mesh.Np*mesh.RZID];
+          dfloat dsdx = mesh.vgeo[base+mesh.Np*mesh.SXID];
+          dfloat dsdy = mesh.vgeo[base+mesh.Np*mesh.SYID];
+          dfloat dsdz = mesh.vgeo[base+mesh.Np*mesh.SZID];
+          dfloat dtdx = mesh.vgeo[base+mesh.Np*mesh.TXID];
+          dfloat dtdy = mesh.vgeo[base+mesh.Np*mesh.TYID];
+          dfloat dtdz = mesh.vgeo[base+mesh.Np*mesh.TZID];
+          dfloat JW   = mesh.vgeo[base+mesh.Np*mesh.JWID];
 
           int idn = n*mesh.Np+i;
           int idm = m*mesh.Np+i;
@@ -1496,38 +1440,38 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){
 
             // grab vol geofacs at surface nodes
             dlong baseM = eM*mesh.Np*mesh.Nvgeo + vidM;
-            dfloat drdxM = mesh.vgeo[baseM+mesh.Np*RXID];
-            dfloat drdyM = mesh.vgeo[baseM+mesh.Np*RYID];
-            dfloat drdzM = mesh.vgeo[baseM+mesh.Np*RZID];
-            dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*SXID];
-            dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*SYID];
-            dfloat dsdzM = mesh.vgeo[baseM+mesh.Np*SZID];
-            dfloat dtdxM = mesh.vgeo[baseM+mesh.Np*TXID];
-            dfloat dtdyM = mesh.vgeo[baseM+mesh.Np*TYID];
-            dfloat dtdzM = mesh.vgeo[baseM+mesh.Np*TZID];
+            dfloat drdxM = mesh.vgeo[baseM+mesh.Np*mesh.RXID];
+            dfloat drdyM = mesh.vgeo[baseM+mesh.Np*mesh.RYID];
+            dfloat drdzM = mesh.vgeo[baseM+mesh.Np*mesh.RZID];
+            dfloat dsdxM = mesh.vgeo[baseM+mesh.Np*mesh.SXID];
+            dfloat dsdyM = mesh.vgeo[baseM+mesh.Np*mesh.SYID];
+            dfloat dsdzM = mesh.vgeo[baseM+mesh.Np*mesh.SZID];
+            dfloat dtdxM = mesh.vgeo[baseM+mesh.Np*mesh.TXID];
+            dfloat dtdyM = mesh.vgeo[baseM+mesh.Np*mesh.TYID];
+            dfloat dtdzM = mesh.vgeo[baseM+mesh.Np*mesh.TZID];
 
             // double check vol geometric factors are in halo storage of vgeo
             dlong idM     = eM*mesh.Nfp*mesh.Nfaces+fM*mesh.Nfp+i;
             int vidP    = (int) (mesh.vmapP[idM]%mesh.Np); // only use this to identify location of positive trace vgeo
             dlong localEP = mesh.vmapP[idM]/mesh.Np;
             dlong baseP   = localEP*mesh.Np*mesh.Nvgeo + vidP; // use local offset for vgeo in halo
-            dfloat drdxP = mesh.vgeo[baseP+mesh.Np*RXID];
-            dfloat drdyP = mesh.vgeo[baseP+mesh.Np*RYID];
-            dfloat drdzP = mesh.vgeo[baseP+mesh.Np*RZID];
-            dfloat dsdxP = mesh.vgeo[baseP+mesh.Np*SXID];
-            dfloat dsdyP = mesh.vgeo[baseP+mesh.Np*SYID];
-            dfloat dsdzP = mesh.vgeo[baseP+mesh.Np*SZID];
-            dfloat dtdxP = mesh.vgeo[baseP+mesh.Np*TXID];
-            dfloat dtdyP = mesh.vgeo[baseP+mesh.Np*TYID];
-            dfloat dtdzP = mesh.vgeo[baseP+mesh.Np*TZID];
+            dfloat drdxP = mesh.vgeo[baseP+mesh.Np*mesh.RXID];
+            dfloat drdyP = mesh.vgeo[baseP+mesh.Np*mesh.RYID];
+            dfloat drdzP = mesh.vgeo[baseP+mesh.Np*mesh.RZID];
+            dfloat dsdxP = mesh.vgeo[baseP+mesh.Np*mesh.SXID];
+            dfloat dsdyP = mesh.vgeo[baseP+mesh.Np*mesh.SYID];
+            dfloat dsdzP = mesh.vgeo[baseP+mesh.Np*mesh.SZID];
+            dfloat dtdxP = mesh.vgeo[baseP+mesh.Np*mesh.TXID];
+            dfloat dtdyP = mesh.vgeo[baseP+mesh.Np*mesh.TYID];
+            dfloat dtdzP = mesh.vgeo[baseP+mesh.Np*mesh.TZID];
 
             // grab surface geometric factors
             dlong base = mesh.Nsgeo*(eM*mesh.Nfp*mesh.Nfaces + fM*mesh.Nfp + i);
-            dfloat nx = mesh.sgeo[base+NXID];
-            dfloat ny = mesh.sgeo[base+NYID];
-            dfloat nz = mesh.sgeo[base+NZID];
-            dfloat wsJ = mesh.sgeo[base+WSJID];
-            dfloat hinv = mesh.sgeo[base+IHID];
+            dfloat nx = mesh.sgeo[base+mesh.NXID];
+            dfloat ny = mesh.sgeo[base+mesh.NYID];
+            dfloat nz = mesh.sgeo[base+mesh.NZID];
+            dfloat wsJ = mesh.sgeo[base+mesh.WSJID];
+            dfloat hinv = mesh.sgeo[base+mesh.IHID];
 
             // form negative trace terms in IPDG
             int idnM = n*mesh.Np+vidM;
@@ -1583,7 +1527,7 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){
               AnmP += -0.5*wsJ*penalty*lnM*lmP; // -((tau/h)*ln^-,lm^+)
             }
           }
-          if(fabs(AnmP)>tol){
+          if(std::abs(AnmP)>tol){
             //#pragma omp critical
             {
               // remote info
@@ -1595,7 +1539,7 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){
             }
           }
         }
-        if(fabs(Anm)>tol){
+        if(std::abs(Anm)>tol){
           //#pragma omp critical
           {
             // local block
@@ -1610,20 +1554,17 @@ void elliptic_t::BuildOperatorMatrixIpdgHex3D(parAlmond::parCOO& A){
   }
 
   // sort received non-zero entries by row block
-  std::sort(A.entries, A.entries+nnz,
-            [](const parAlmond::parCOO::nonZero_t& a,
-               const parAlmond::parCOO::nonZero_t& b) {
-              if (a.row < b.row) return true;
-              if (a.row > b.row) return false;
+  sort(A.entries.ptr(), A.entries.ptr()+nnz,
+      [](const parAlmond::parCOO::nonZero_t& a,
+         const parAlmond::parCOO::nonZero_t& b) {
+        if (a.row < b.row) return true;
+        if (a.row > b.row) return false;
 
-              return a.col < b.col;
-            });
+        return a.col < b.col;
+      });
 
   //*A = (parAlmond::parCOO::nonZero_t*) realloc(*A, nnz*sizeof(parAlmond::parCOO::nonZero_t));
   A.nnz = nnz;
 
-  if(rankM==0) printf("done.\n");
-
-  free(globalIds);
-  free(B);  free(Br); free(Bs); free(Bt);
+  if(Comm::World().rank()==0) printf("done.\n");
 }
diff --git a/solvers/elliptic/src/ellipticOperator.cpp b/solvers/elliptic/src/ellipticOperator.cpp
index fb5567d1d..6fa4cbb5d 100644
--- a/solvers/elliptic/src/ellipticOperator.cpp
+++ b/solvers/elliptic/src/ellipticOperator.cpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -26,24 +26,25 @@
 
 #include "elliptic.hpp"
 
-void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){
+void elliptic_t::Operator(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_Aq){
 
   if(disc_c0){
-    // int mapType = (mesh.elementType==HEXAHEDRA &&
+    // int mapType = (mesh.elementType==Mesh::HEXAHEDRA &&
     //                mesh.settings.compareSetting("ELEMENT MAP", "TRILINEAR")) ? 1:0;
 
-    // int integrationType = (mesh.elementType==HEXAHEDRA &&
+    // int integrationType = (mesh.elementType==Mesh::HEXAHEDRA &&
     //                        settings.compareSetting("ELLIPTIC INTEGRATION", "CUBATURE")) ? 1:0;
 
-    ogsMasked->GatheredHaloExchangeStart(o_q, 1, ogs_dfloat);
+    gHalo.ExchangeStart(o_q, 1);
 
-    if(mesh.NlocalGatherElements){
+    if(mesh.NlocalGatherElements/2){
       // if(integrationType==0) { // GLL or non-hex
         // if(mapType==0)
-          partialAxKernel(mesh.NlocalGatherElements,
+          partialAxKernel(mesh.NlocalGatherElements/2,
                           mesh.o_localGatherElementList,
-                          ogsMasked->o_GlobalToLocal,
-                          mesh.o_ggeo, mesh.o_D, mesh.o_S,
+                          o_GlobalToLocal,
+                          mesh.o_wJ, mesh.o_ggeo,
+                          mesh.o_D, mesh.o_S,
                           mesh.o_MM, lambda, o_q, o_AqL);
         /* NC: disabling until we re-add treatment of affine elements
         else
@@ -63,7 +64,7 @@ void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){
     }
 
     // finalize halo exchange
-    ogsMasked->GatheredHaloExchangeFinish(o_q, 1, ogs_dfloat);
+    gHalo.ExchangeFinish(o_q, 1);
 
     if(mesh.NglobalGatherElements) {
 
@@ -71,8 +72,9 @@ void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){
         // if(mapType==0)
           partialAxKernel(mesh.NglobalGatherElements,
                           mesh.o_globalGatherElementList,
-                          ogsMasked->o_GlobalToLocal,
-                          mesh.o_ggeo, mesh.o_D, mesh.o_S,
+                          o_GlobalToLocal,
+                          mesh.o_wJ, mesh.o_ggeo,
+                          mesh.o_D, mesh.o_S,
                           mesh.o_MM, lambda, o_q, o_AqL);
         /* NC: disabling until we re-add treatment of affine elements
         else
@@ -90,7 +92,18 @@ void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){
     }
 
     //gather result to Aq
-    ogsMasked->Gather(o_Aq, o_AqL, ogs_dfloat, ogs_add, ogs_trans);
+    ogsMasked.GatherStart(o_Aq, o_AqL, 1, ogs::Add, ogs::Trans);
+
+    if((mesh.NlocalGatherElements+1)/2){
+      partialAxKernel((mesh.NlocalGatherElements+1)/2,
+                      mesh.o_localGatherElementList+(mesh.NlocalGatherElements/2),
+                      o_GlobalToLocal,
+                      mesh.o_wJ, mesh.o_ggeo,
+                      mesh.o_D, mesh.o_S,
+                      mesh.o_MM, lambda, o_q, o_AqL);
+    }
+
+    ogsMasked.GatherFinish(o_Aq, o_AqL, 1, ogs::Add, ogs::Trans);
 
   } else if(disc_ipdg) {
 
@@ -105,7 +118,7 @@ void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){
     }
 
     // dfloat4 storage -> 4 entries
-    traceHalo->ExchangeStart(o_grad, 4, ogs_dfloat);
+    traceHalo.ExchangeStart(o_grad, 4);
 
     if(mesh.NinternalElements)
       partialIpdgKernel(mesh.NinternalElements,
@@ -123,7 +136,7 @@ void elliptic_t::Operator(occa::memory &o_q, occa::memory &o_Aq){
                         o_grad,
                         o_Aq);
 
-    traceHalo->ExchangeFinish(o_grad, 4, ogs_dfloat);
+    traceHalo.ExchangeFinish(o_grad, 4);
 
     if(mesh.NhaloElements) {
       partialIpdgKernel(mesh.NhaloElements,
diff --git a/solvers/elliptic/src/ellipticPlotFields.cpp b/solvers/elliptic/src/ellipticPlotFields.cpp
index ea3437167..8df1af8ba 100644
--- a/solvers/elliptic/src/ellipticPlotFields.cpp
+++ b/solvers/elliptic/src/ellipticPlotFields.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,11 @@ SOFTWARE.
 #include "elliptic.hpp"
 
 // interpolate data to plot nodes and save to file (one per process
-void elliptic_t::PlotFields(dfloat* Q, char *fileName){
+void elliptic_t::PlotFields(memory<dfloat>& Q, std::string fileName){
 
   FILE *fp;
 
-  fp = fopen(fileName, "w");
+  fp = fopen(fileName.c_str(), "w");
 
   fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
   fprintf(fp, "  <UnstructuredGrid>\n");
@@ -44,30 +44,36 @@ void elliptic_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
 
   //scratch space for interpolation
-  size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat);
-  dfloat* scratch = (dfloat *) malloc(2*NscratchBytes);
+  size_t Nscratch = std::max(mesh.Np, mesh.plotNp);
+  memory<dfloat> scratch(2*Nscratch);
 
-  dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ix(mesh.plotNp);
+  memory<dfloat> Iy(mesh.plotNp);
+  memory<dfloat> Iz(mesh.plotNp);
 
   // compute plot node coordinates on the fly
   for(dlong e=0;e<mesh.Nelements;++e){
     mesh.PlotInterp(mesh.x + e*mesh.Np, Ix, scratch);
     mesh.PlotInterp(mesh.y + e*mesh.Np, Iy, scratch);
-    mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
+    if(mesh.dim==3)
+      mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
 
-    for(int n=0;n<mesh.plotNp;++n){
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+    if (mesh.dim==2) {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],0.0);
+      }
+    } else {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+      }
     }
   }
   fprintf(fp, "        </DataArray>\n");
   fprintf(fp, "      </Points>\n");
 
-  free(Ix); free(Iy); free(Iz);
-
-  dfloat* Iq = (dfloat *) malloc(mesh.plotNp*Nfields*sizeof(dfloat));
+  memory<dfloat> Iq(mesh.plotNp*Nfields);
 
   // write out fields
   fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
@@ -86,8 +92,6 @@ void elliptic_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "       </DataArray>\n");
   fprintf(fp, "     </PointData>\n");
 
-  free(Iq);
-
   fprintf(fp, "    <Cells>\n");
   fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
 
@@ -128,6 +132,4 @@ void elliptic_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "  </UnstructuredGrid>\n");
   fprintf(fp, "</VTKFile>\n");
   fclose(fp);
-
-  free(scratch);
 }
diff --git a/solvers/elliptic/src/ellipticPreconJacobi.cpp b/solvers/elliptic/src/ellipticPreconJacobi.cpp
index 0dc1694ec..805222ed1 100644
--- a/solvers/elliptic/src/ellipticPreconJacobi.cpp
+++ b/solvers/elliptic/src/ellipticPreconJacobi.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -30,23 +30,22 @@ SOFTWARE.
 JacobiPrecon::JacobiPrecon(elliptic_t& _elliptic):
   elliptic(_elliptic) {
 
-  dfloat *diagA    = (dfloat*) calloc(elliptic.Ndofs, sizeof(dfloat));
-  dfloat *invDiagA = (dfloat*) calloc(elliptic.Ndofs, sizeof(dfloat));
+  memory<dfloat> diagA   (elliptic.Ndofs);
+  memory<dfloat> invDiagA(elliptic.Ndofs);
   elliptic.BuildOperatorDiagonal(diagA);
   for (dlong n=0;n<elliptic.Ndofs;n++)
     invDiagA[n] = 1.0/diagA[n];
 
-  o_invDiagA = elliptic.platform.malloc(elliptic.Ndofs*sizeof(dfloat), invDiagA);
-
-  free(diagA);
-  free(invDiagA);
+  o_invDiagA = elliptic.platform.malloc<dfloat>(invDiagA);
 }
 
-void JacobiPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) {
+void JacobiPrecon::Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr) {
+
+  linAlg_t& linAlg = elliptic.platform.linAlg();
 
   // Mr = invDiag.*r
-  elliptic.linAlg.amxpy(elliptic.Ndofs, 1.0, o_invDiagA, o_r, 0.0, o_Mr);
+  linAlg.amxpy(elliptic.Ndofs, 1.0, o_invDiagA, o_r, 0.0, o_Mr);
 
   // zero mean of RHS
   if(elliptic.allNeumann) elliptic.ZeroMean(o_Mr);
-}
\ No newline at end of file
+}
diff --git a/solvers/elliptic/src/ellipticPreconMassMatrix.cpp b/solvers/elliptic/src/ellipticPreconMassMatrix.cpp
index d425e8219..b05c2237e 100644
--- a/solvers/elliptic/src/ellipticPreconMassMatrix.cpp
+++ b/solvers/elliptic/src/ellipticPreconMassMatrix.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -31,67 +31,80 @@ MassMatrixPrecon::MassMatrixPrecon(elliptic_t& _elliptic):
   elliptic(_elliptic), mesh(_elliptic.mesh), settings(_elliptic.settings) {
 
   //sanity checking
-  if (mesh.elementType!=TRIANGLES && mesh.elementType!=TETRAHEDRA )
-    LIBP_ABORT(string("MASSMATRIX preconditioner is only available for triangle and tetrhedra elements. Use JACOBI instead."));
+  LIBP_ABORT("MASSMATRIX preconditioner is only available for triangle and tetrahedra elements. Use JACOBI instead.",
+             mesh.elementType!=Mesh::TRIANGLES && mesh.elementType!=Mesh::TETRAHEDRA);
 
-  if (elliptic.lambda==0)
-    LIBP_ABORT(string("MASSMATRIX preconditioner is unavailble when lambda=0."));
+  LIBP_ABORT("MASSMATRIX preconditioner is unavailable when lambda=0.",
+             elliptic.lambda==0);
 
-  o_invMM = elliptic.platform.malloc(mesh.Np*mesh.Np*sizeof(dfloat), mesh.invMM);
+  o_invMM = elliptic.platform.malloc<dfloat>(mesh.invMM);
 
   // OCCA build stuff
-  occa::properties kernelInfo = elliptic.mesh.props; //copy base occa properties
+  properties_t kernelInfo = mesh.props; //copy base occa properties
 
   int blockMax = 256;
   if (elliptic.platform.device.mode() == "CUDA") blockMax = 512;
 
-  int NblockV = mymax(1,blockMax/mesh.Np);
+  int NblockV = std::max(1,blockMax/mesh.Np);
   kernelInfo["defines/" "p_NblockV"]= NblockV;
 
   if (settings.compareSetting("DISCRETIZATION", "IPDG")) {
     blockJacobiKernel = elliptic.platform.buildKernel(DELLIPTIC "/okl/ellipticPreconBlockJacobi.okl",
                                      "blockJacobi", kernelInfo);
   } else if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) {
-    dlong Ntotal = elliptic.ogsMasked->Ngather + elliptic.ogsMasked->NgatherHalo;
-    o_rtmp = elliptic.platform.malloc(Ntotal*sizeof(dfloat));
-    o_MrL  = elliptic.platform.malloc(mesh.Np*mesh.Nelements*sizeof(dfloat));
+    dlong Ntotal = elliptic.ogsMasked.Ngather + elliptic.gHalo.Nhalo;
+    o_rtmp = elliptic.platform.malloc<dfloat>(Ntotal);
+    o_MrL  = elliptic.platform.malloc<dfloat>(mesh.Np*mesh.Nelements);
 
     partialBlockJacobiKernel = elliptic.platform.buildKernel(DELLIPTIC "/okl/ellipticPreconBlockJacobi.okl",
                                      "partialBlockJacobi", kernelInfo);
   }
 }
 
-void MassMatrixPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) {
+void MassMatrixPrecon::Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr) {
   dfloat invLambda = 1./elliptic.lambda;
 
+  linAlg_t& linAlg = elliptic.platform.linAlg();
+
   if (elliptic.disc_c0) {//C0
 
     // rtmp = invDegree.*r
-    elliptic.linAlg.amxpy(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_r, 0.0, o_rtmp);
+    linAlg.amxpy(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_r, 0.0, o_rtmp);
 
-    elliptic.ogsMasked->GatheredHaloExchangeStart(o_rtmp, 1, ogs_dfloat);
+    elliptic.gHalo.ExchangeStart(o_rtmp, 1);
 
-    if(mesh.NlocalGatherElements)
-      partialBlockJacobiKernel(mesh.NlocalGatherElements,
+    if(mesh.NlocalGatherElements/2)
+      partialBlockJacobiKernel(mesh.NlocalGatherElements/2,
                                mesh.o_localGatherElementList,
-                               elliptic.ogsMasked->o_GlobalToLocal,
+                               elliptic.o_GlobalToLocal,
                                invLambda, mesh.o_vgeo, o_invMM,
                                o_rtmp, o_MrL);
 
-    elliptic.ogsMasked->GatheredHaloExchangeFinish(o_rtmp, 1, ogs_dfloat);
+    // finalize halo exchange
+    elliptic.gHalo.ExchangeFinish(o_rtmp, 1);
 
     if(mesh.NglobalGatherElements)
       partialBlockJacobiKernel(mesh.NglobalGatherElements,
                                mesh.o_globalGatherElementList,
-                               elliptic.ogsMasked->o_GlobalToLocal,
+                               elliptic.o_GlobalToLocal,
                                invLambda, mesh.o_vgeo, o_invMM,
                                o_rtmp, o_MrL);
 
     //gather result to Aq
-    elliptic.ogsMasked->Gather(o_Mr, o_MrL, ogs_dfloat, ogs_add, ogs_trans);
+    elliptic.ogsMasked.GatherStart(o_Mr, o_MrL, 1, ogs::Add, ogs::Trans);
+
+    if((mesh.NlocalGatherElements+1)/2){
+      partialBlockJacobiKernel((mesh.NlocalGatherElements+1)/2,
+                               mesh.o_localGatherElementList+mesh.NlocalGatherElements/2,
+                               elliptic.o_GlobalToLocal,
+                               invLambda, mesh.o_vgeo, o_invMM,
+                               o_rtmp, o_MrL);
+    }
+
+    elliptic.ogsMasked.GatherFinish(o_Mr, o_MrL, 1, ogs::Add, ogs::Trans);
 
     // Mr = invDegree.*Mr
-    elliptic.linAlg.amx(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_Mr);
+    linAlg.amx(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_Mr);
 
   } else {
     //IPDG
@@ -101,8 +114,3 @@ void MassMatrixPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) {
   // zero mean of RHS
   if(elliptic.allNeumann) elliptic.ZeroMean(o_Mr);
 }
-
-MassMatrixPrecon::~MassMatrixPrecon(){
-  blockJacobiKernel.free();
-  partialBlockJacobiKernel.free();
-}
\ No newline at end of file
diff --git a/solvers/elliptic/src/ellipticPreconMultiGrid.cpp b/solvers/elliptic/src/ellipticPreconMultiGrid.cpp
index a6b027df9..169e5c625 100644
--- a/solvers/elliptic/src/ellipticPreconMultiGrid.cpp
+++ b/solvers/elliptic/src/ellipticPreconMultiGrid.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,7 +28,7 @@ SOFTWARE.
 
 
 // Matrix-free p-Multigrid levels followed by AMG
-void MultiGridPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) {
+void MultiGridPrecon::Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr) {
 
   //just pass to parAlmond
   parAlmond.Operator(o_r, o_Mr);
@@ -46,37 +46,38 @@ MultiGridPrecon::MultiGridPrecon(elliptic_t& _elliptic):
   int NpFine   = mesh.Np;
   int NpCoarse = mesh.Np;
 
-  MGLevel* prevLevel=nullptr;
-  MGLevel* currLevel=nullptr;
-
   while(Nc>1) {
+    if (Comm::World().rank()==0){
+      printf("-----------------------------Multigrid pMG Degree %2d----------------------------------------\n", Nc);
+    }
     //build mesh and elliptic objects for this degree
-    mesh_t &meshF = mesh.SetupNewDegree(Nf);
-    elliptic_t &ellipticF = elliptic.SetupNewDegree(meshF);
+    mesh_t meshF = mesh.SetupNewDegree(Nf);
+    elliptic_t ellipticF = elliptic.SetupNewDegree(meshF);
 
     //share masking data with previous MG level
-    if (prevLevel) {
-      prevLevel->meshC = &meshF;
-      prevLevel->ogsMaskedC = ellipticF.ogsMasked;
+    if (parAlmond.NumLevels()>0) {
+      MGLevel& prevLevel = parAlmond.GetLevel<MGLevel>(parAlmond.NumLevels()-1);
+      prevLevel.meshC = meshF;
+      prevLevel.ellipticC = ellipticF;
     }
 
     //find the degree of the next level
     if (settings.compareSetting("MULTIGRID COARSENING","ALLDEGREES")) {
       Nc = Nf-1;
     } else if (settings.compareSetting("MULTIGRID COARSENING","HALFDEGREES")) {
-      Nc = mymax(1,(Nf+1)/2);
+      Nc = std::max(1,(Nf+1)/2);
     } else { //default "HALFDOFS"
       // pick the degrees so the dofs of each level halfs (roughly)
       while (NpCoarse > NpFine/2 && Nc>1) {
         Nc--;
         switch(mesh.elementType){
-          case TRIANGLES:
+          case Mesh::TRIANGLES:
             NpCoarse = ((Nc+1)*(Nc+2))/2; break;
-          case QUADRILATERALS:
+          case Mesh::QUADRILATERALS:
             NpCoarse = (Nc+1)*(Nc+1); break;
-          case TETRAHEDRA:
+          case Mesh::TETRAHEDRA:
             NpCoarse = ((Nc+1)*(Nc+2)*(Nc+3))/6; break;
-          case HEXAHEDRA:
+          case Mesh::HEXAHEDRA:
             NpCoarse = (Nc+1)*(Nc+1)*(Nc+1); break;
         }
       }
@@ -84,45 +85,50 @@ MultiGridPrecon::MultiGridPrecon(elliptic_t& _elliptic):
 
     //set Npcoarse
     switch(mesh.elementType){
-      case TRIANGLES:
+      case Mesh::TRIANGLES:
         NpCoarse = ((Nc+1)*(Nc+2))/2; break;
-      case QUADRILATERALS:
+      case Mesh::QUADRILATERALS:
         NpCoarse = (Nc+1)*(Nc+1); break;
-      case TETRAHEDRA:
+      case Mesh::TETRAHEDRA:
         NpCoarse = ((Nc+1)*(Nc+2)*(Nc+3))/6; break;
-      case HEXAHEDRA:
+      case Mesh::HEXAHEDRA:
         NpCoarse = (Nc+1)*(Nc+1)*(Nc+1); break;
     }
 
     dlong Nrows, Ncols;
     if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) {
-      Nrows = ellipticF.ogsMasked->Ngather;
-      Ncols = Nrows + ellipticF.ogsMasked->NgatherHalo;
+      Nrows = ellipticF.ogsMasked.Ngather;
+      Ncols = Nrows + ellipticF.gHalo.Nhalo;
     } else {
       Nrows = meshF.Nelements*meshF.Np;
       Ncols = Nrows + meshF.totalHaloPairs*mesh.Np;
     }
 
-    //make a multigrid level
-    currLevel = new MGLevel(ellipticF, Nrows, Ncols, Nc, NpCoarse);
-    parAlmond.AddLevel(currLevel);
+    //Add a multigrid level
+    parAlmond.AddLevel<MGLevel>(ellipticF, Nrows, Ncols, Nc, NpCoarse);
 
     Nf = Nc;
     NpFine = NpCoarse;
-    prevLevel = currLevel;
   }
 
   //build matrix at degree 1
-  mesh_t &meshF = mesh.SetupNewDegree(1);
-  elliptic_t &ellipticF = elliptic.SetupNewDegree(meshF);
+  if (Comm::World().rank()==0){
+    printf("-----------------------------Multigrid pMG Degree  1----------------------------------------\n");
+  }
+  mesh_t meshF = mesh.SetupNewDegree(1);
+  elliptic_t ellipticF = elliptic.SetupNewDegree(meshF);
 
   //share masking data with previous MG level
-  if (prevLevel) {
-    prevLevel->meshC = &meshF;
-    prevLevel->ogsMaskedC = ellipticF.ogsMasked;
+  if (parAlmond.NumLevels()>0) {
+    MGLevel& prevLevel = parAlmond.GetLevel<MGLevel>(parAlmond.NumLevels()-1);
+    prevLevel.meshC = meshF;
+    prevLevel.ellipticC = ellipticF;
   }
 
   //build full A matrix and pass to parAlmond
+  if (Comm::World().rank()==0){
+    printf("-----------------------------Multigrid AMG Setup--------------------------------------------\n");
+  }
   parAlmond::parCOO A(elliptic.platform, mesh.comm);
   if (settings.compareSetting("DISCRETIZATION", "IPDG"))
     ellipticF.BuildOperatorMatrixIpdg(A);
@@ -133,13 +139,15 @@ MultiGridPrecon::MultiGridPrecon(elliptic_t& _elliptic):
   int rank = mesh.rank;
   int size = mesh.size;
   hlong TotalRows = A.globalRowStarts[size];
-  dlong numLocalRows = (dlong) (A.globalRowStarts[rank+1]-A.globalRowStarts[rank]);
-  dfloat *null = (dfloat *) malloc(numLocalRows*sizeof(dfloat));
-  for (dlong i=0;i<numLocalRows;i++) null[i] = 1.0/sqrt(TotalRows);
+  dlong numLocalRows = static_cast<dlong>(A.globalRowStarts[rank+1]-A.globalRowStarts[rank]);
+
+  memory<dfloat> null(numLocalRows);
+  for (dlong i=0;i<numLocalRows;i++) {
+    null[i] = 1.0/sqrt(TotalRows);
+  }
 
   //set up AMG levels (treating the N=1 level as a matrix level)
   parAlmond.AMGSetup(A, elliptic.allNeumann, null, elliptic.allNeumannPenalty);
-  free(null);
 
   //report
   parAlmond.Report();
diff --git a/solvers/elliptic/src/ellipticPreconMultiGridLevel.cpp b/solvers/elliptic/src/ellipticPreconMultiGridLevel.cpp
index 85ffe6d18..5a8db8cf0 100644
--- a/solvers/elliptic/src/ellipticPreconMultiGridLevel.cpp
+++ b/solvers/elliptic/src/ellipticPreconMultiGridLevel.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,74 +27,95 @@ SOFTWARE.
 #include "elliptic.hpp"
 #include "ellipticPrecon.hpp"
 
-void MGLevel::Operator(occa::memory &o_X, occa::memory &o_Ax) {
+void MGLevel::Operator(deviceMemory<dfloat>& o_X, deviceMemory<dfloat>& o_Ax) {
   elliptic.Operator(o_X,o_Ax);
 }
 
-void MGLevel::residual(occa::memory &o_RHS, occa::memory &o_X, occa::memory &o_RES) {
+void MGLevel::residual(deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_X, deviceMemory<dfloat>& o_RES) {
   elliptic.Operator(o_X,o_RES);
 
   // subtract res = rhs - A*x
-  linAlg.axpy(elliptic.Ndofs, 1.f, o_RHS, -1.f, o_RES);
+  platform.linAlg().axpy(elliptic.Ndofs, 1.f, o_RHS, -1.f, o_RES);
 }
 
-void MGLevel::coarsen(occa::memory &o_X, occa::memory &o_Rx) {
+void MGLevel::coarsen(deviceMemory<dfloat>& o_X, deviceMemory<dfloat>& o_Rx) {
+
+  linAlg_t& linAlg = platform.linAlg();
 
   if (elliptic.disc_c0) {
     //scratch spaces
-    occa::memory &o_wx = o_smootherResidual;
-    occa::memory &o_RxL = o_transferScratch;
+    deviceMemory<dfloat>& o_wx = o_smootherResidual;
+    deviceMemory<dfloat>& o_RxL = o_transferScratch;
 
     //pre-weight
     linAlg.amxpy(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_X, 0.0, o_wx);
 
-    elliptic.ogsMasked->GatheredHaloExchangeStart(o_wx, 1, ogs_dfloat);
+    elliptic.gHalo.ExchangeStart(o_wx, 1);
 
-    if(mesh.NlocalGatherElements)
-      partialCoarsenKernel(mesh.NlocalGatherElements,
+    if(mesh.NlocalGatherElements/2)
+      partialCoarsenKernel(mesh.NlocalGatherElements/2,
                            mesh.o_localGatherElementList,
-                           elliptic.ogsMasked->o_GlobalToLocal,
+                           elliptic.o_GlobalToLocal,
                            o_P, o_wx, o_RxL);
 
-    elliptic.ogsMasked->GatheredHaloExchangeFinish(o_wx, 1, ogs_dfloat);
+    elliptic.gHalo.ExchangeFinish(o_wx, 1);
 
     if(mesh.NglobalGatherElements)
       partialCoarsenKernel(mesh.NglobalGatherElements,
                            mesh.o_globalGatherElementList,
-                           elliptic.ogsMasked->o_GlobalToLocal,
+                           elliptic.o_GlobalToLocal,
+                           o_P, o_wx, o_RxL);
+
+    ellipticC.ogsMasked.GatherStart(o_Rx, o_RxL, 1, ogs::Add, ogs::Trans);
+
+    if((mesh.NlocalGatherElements+1)/2)
+      partialCoarsenKernel((mesh.NlocalGatherElements+1)/2,
+                           mesh.o_localGatherElementList + mesh.NlocalGatherElements/2,
+                           elliptic.o_GlobalToLocal,
                            o_P, o_wx, o_RxL);
 
-    ogsMaskedC->Gather(o_Rx, o_RxL, ogs_dfloat, ogs_add, ogs_trans);
+    ellipticC.ogsMasked.GatherFinish(o_Rx, o_RxL, 1, ogs::Add, ogs::Trans);
 
   } else {
     coarsenKernel(mesh.Nelements, o_P, o_X, o_Rx);
   }
 }
 
-void MGLevel::prolongate(occa::memory &o_X, occa::memory &o_Px) {
+void MGLevel::prolongate(deviceMemory<dfloat>& o_X, deviceMemory<dfloat>& o_Px) {
+
+  linAlg_t& linAlg = platform.linAlg();
+
   if (elliptic.disc_c0) {
     //scratch spaces
-    occa::memory &o_PxG = o_smootherResidual;
-    occa::memory &o_PxL = o_transferScratch;
+    deviceMemory<dfloat>& o_PxG = o_smootherResidual;
+    deviceMemory<dfloat>& o_PxL = o_transferScratch;
 
-    ogsMaskedC->GatheredHaloExchangeStart(o_X, 1, ogs_dfloat);
+    ellipticC.gHalo.ExchangeStart(o_X, 1);
 
-    if(mesh.NlocalGatherElements)
-      partialProlongateKernel(meshC->NlocalGatherElements,
-                              meshC->o_localGatherElementList,
-                              ogsMaskedC->o_GlobalToLocal,
+    if(meshC.NlocalGatherElements/2)
+      partialProlongateKernel(meshC.NlocalGatherElements/2,
+                              meshC.o_localGatherElementList,
+                              ellipticC.o_GlobalToLocal,
                               o_P, o_X, o_PxL);
 
-    ogsMaskedC->GatheredHaloExchangeFinish(o_X, 1, ogs_dfloat);
+    ellipticC.gHalo.ExchangeFinish(o_X, 1);
 
-    if(mesh.NglobalGatherElements)
-      partialProlongateKernel(meshC->NglobalGatherElements,
-                              meshC->o_globalGatherElementList,
-                              ogsMaskedC->o_GlobalToLocal,
+    if(meshC.NglobalGatherElements)
+      partialProlongateKernel(meshC.NglobalGatherElements,
+                              meshC.o_globalGatherElementList,
+                              ellipticC.o_GlobalToLocal,
                               o_P, o_X, o_PxL);
 
     //ogs_notrans -> no summation at repeated nodes, just one value
-    elliptic.ogsMasked->Gather(o_PxG, o_PxL, ogs_dfloat, ogs_add, ogs_notrans);
+    elliptic.ogsMasked.GatherStart(o_PxG, o_PxL, 1, ogs::Add, ogs::NoTrans);
+
+    if((meshC.NlocalGatherElements+1)/2)
+      partialProlongateKernel((meshC.NlocalGatherElements+1)/2,
+                              meshC.o_localGatherElementList + meshC.NlocalGatherElements/2,
+                              ellipticC.o_GlobalToLocal,
+                              o_P, o_X, o_PxL);
+
+    elliptic.ogsMasked.GatherFinish(o_PxG, o_PxL, 1, ogs::Add, ogs::NoTrans);
 
     linAlg.axpy(elliptic.Ndofs, 1.f, o_PxG, 1.f, o_Px);
 
@@ -103,7 +124,7 @@ void MGLevel::prolongate(occa::memory &o_X, occa::memory &o_Px) {
   }
 }
 
-void MGLevel::smooth(occa::memory &o_RHS, occa::memory &o_X, bool x_is_zero) {
+void MGLevel::smooth(deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_X, bool x_is_zero) {
   if (stype==JACOBI) {
     smoothJacobi(o_RHS, o_X, x_is_zero);
   } else if (stype==CHEBYSHEV) {
@@ -111,9 +132,11 @@ void MGLevel::smooth(occa::memory &o_RHS, occa::memory &o_X, bool x_is_zero) {
   }
 }
 
-void MGLevel::smoothJacobi(occa::memory &o_r, occa::memory &o_X, bool xIsZero) {
+void MGLevel::smoothJacobi(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_X, bool xIsZero) {
 
-  occa::memory &o_RES = o_smootherResidual;
+  linAlg_t& linAlg = platform.linAlg();
+
+  deviceMemory<dfloat>& o_RES = o_smootherResidual;
 
   if (xIsZero) {
     linAlg.amxpy(elliptic.Ndofs, 1.0, o_invDiagA, o_r, 0.0, o_X);
@@ -128,7 +151,7 @@ void MGLevel::smoothJacobi(occa::memory &o_r, occa::memory &o_X, bool xIsZero) {
   linAlg.amxpy(elliptic.Ndofs, 1.0, o_invDiagA, o_RES, 1.0, o_X);
 }
 
-void MGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_X, bool xIsZero) {
+void MGLevel::smoothChebyshev (deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_X, bool xIsZero) {
 
   const dfloat theta = 0.5*(lambda1+lambda0);
   const dfloat delta = 0.5*(lambda1-lambda0);
@@ -137,9 +160,11 @@ void MGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_X, bool xIsZer
   dfloat rho_n = 1./sigma;
   dfloat rho_np1;
 
-  occa::memory &o_RES = o_smootherResidual;
-  occa::memory &o_Ad  = o_smootherResidual2;
-  occa::memory &o_d   = o_smootherUpdate;
+  deviceMemory<dfloat>& o_RES = o_smootherResidual;
+  deviceMemory<dfloat>& o_Ad  = o_smootherResidual2;
+  deviceMemory<dfloat>& o_d   = o_smootherUpdate;
+
+  linAlg_t& linAlg = platform.linAlg();
 
   if(xIsZero){ //skip the Ax if x is zero
     //res = S*r
@@ -187,56 +212,56 @@ void MGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_X, bool xIsZer
 *
 *******************************************/
 
-size_t  MGLevel::smootherResidualBytes=0;
-size_t  MGLevel::scratchBytes=0;
-dfloat* MGLevel::smootherResidual=nullptr;
-occa::memory MGLevel::o_smootherResidual;
-occa::memory MGLevel::o_smootherResidual2;
-occa::memory MGLevel::o_smootherUpdate;
-occa::memory MGLevel::o_transferScratch;
+dlong  MGLevel::NsmootherResidual=0;
+dlong  MGLevel::Nscratch=0;
+memory<dfloat> MGLevel::smootherResidual;
+deviceMemory<dfloat> MGLevel::o_smootherResidual;
+deviceMemory<dfloat> MGLevel::o_smootherResidual2;
+deviceMemory<dfloat> MGLevel::o_smootherUpdate;
+deviceMemory<dfloat> MGLevel::o_transferScratch;
 
 //build a level and connect it to the next one
 MGLevel::MGLevel(elliptic_t& _elliptic,
                  dlong _Nrows, dlong _Ncols,
                  int Nc, int NpCoarse):
   multigridLevel(_Nrows, _Ncols,
-                 _elliptic.platform, _elliptic.settings),
+                 _elliptic.platform,
+                 _elliptic.settings,
+                 _elliptic.comm),
   elliptic(_elliptic),
-  mesh(_elliptic.mesh),
-  linAlg(_elliptic.linAlg) {
+  mesh(_elliptic.mesh) {
 
   SetupSmoother();
   AllocateStorage();
 
-  if (mesh.elementType==QUADRILATERALS || mesh.elementType==HEXAHEDRA) {
-    P = (dfloat *) calloc((mesh.N+1)*(Nc+1),sizeof(dfloat));
+  if (   mesh.elementType==Mesh::QUADRILATERALS
+      || mesh.elementType==Mesh::HEXAHEDRA) {
     mesh.DegreeRaiseMatrix1D(Nc, mesh.N, P);
-    o_P = elliptic.platform.malloc((mesh.N+1)*(Nc+1)*sizeof(dfloat), P);
-  } else if (mesh.elementType==TRIANGLES) {
-    P = (dfloat *) calloc(mesh.Np*NpCoarse,sizeof(dfloat));
+  } else if (mesh.elementType==Mesh::TRIANGLES) {
     mesh.DegreeRaiseMatrixTri2D(Nc, mesh.N, P);
-    o_P = elliptic.platform.malloc(mesh.Np*NpCoarse*sizeof(dfloat), P);
-  } else {
-    P = (dfloat *) calloc(mesh.Np*NpCoarse,sizeof(dfloat));
+  } else { //Mesh::TETRAHEDRA
     mesh.DegreeRaiseMatrixTet3D(Nc, mesh.N, P);
-    o_P = elliptic.platform.malloc(mesh.Np*NpCoarse*sizeof(dfloat), P);
   }
+  o_P = elliptic.platform.malloc<dfloat>(P);
 
   //build kernels
-  occa::properties kernelInfo = elliptic.platform.props;
+  properties_t kernelInfo = elliptic.platform.props();
 
   // set kernel name suffix
-  char *suffix;
-  if(mesh.elementType==TRIANGLES)
-    suffix = strdup("Tri2D");
-  else if(mesh.elementType==QUADRILATERALS)
-    suffix = strdup("Quad2D");
-  else if(mesh.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  else if(mesh.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  std::string suffix;
+  if(mesh.elementType==Mesh::TRIANGLES)
+    suffix = "Tri2D";
+  else if(mesh.elementType==Mesh::QUADRILATERALS)
+    suffix = "Quad2D";
+  else if(mesh.elementType==Mesh::TETRAHEDRA)
+    suffix = "Tet3D";
+  else if(mesh.elementType==Mesh::HEXAHEDRA)
+    suffix = "Hex3D";
+
+  std::string oklFilePrefix = DELLIPTIC "/okl/";
+  std::string oklFileSuffix = ".okl";
+
+  std::string fileName, kernelName;
 
   kernelInfo["defines/" "p_NqFine"]= mesh.N+1;
   kernelInfo["defines/" "p_NqCoarse"]= Nc+1;
@@ -247,74 +272,60 @@ MGLevel::MGLevel(elliptic_t& _elliptic,
   int blockMax = 256;
   if (elliptic.platform.device.mode() == "CUDA") blockMax = 512;
 
-  int NblockVFine = mymax(1,blockMax/mesh.Np);
-  int NblockVCoarse = mymax(1,blockMax/NpCoarse);
+  int NblockVFine = std::max(1,blockMax/mesh.Np);
+  int NblockVCoarse = std::max(1,blockMax/NpCoarse);
   kernelInfo["defines/" "p_NblockVFine"]= NblockVFine;
   kernelInfo["defines/" "p_NblockVCoarse"]= NblockVCoarse;
 
   if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) {
-    sprintf(fileName, DELLIPTIC "/okl/ellipticPreconCoarsen%s.okl", suffix);
-    sprintf(kernelName, "ellipticPartialPreconCoarsen%s", suffix);
+    fileName   = oklFilePrefix + "ellipticPreconCoarsen" + suffix + oklFileSuffix;
+    kernelName = "ellipticPartialPreconCoarsen" + suffix;
     partialCoarsenKernel = elliptic.platform.buildKernel(fileName, kernelName, kernelInfo);
 
-    sprintf(fileName, DELLIPTIC "/okl/ellipticPreconProlongate%s.okl", suffix);
-    sprintf(kernelName, "ellipticPartialPreconProlongate%s", suffix);
+    fileName   = oklFilePrefix + "ellipticPreconProlongate" + suffix + oklFileSuffix;
+    kernelName = "ellipticPartialPreconProlongate" + suffix;
     partialProlongateKernel = elliptic.platform.buildKernel(fileName, kernelName, kernelInfo);
   } else { //IPDG
-    sprintf(fileName, DELLIPTIC "/okl/ellipticPreconCoarsen%s.okl", suffix);
-    sprintf(kernelName, "ellipticPreconCoarsen%s", suffix);
+    fileName   = oklFilePrefix + "ellipticPreconCoarsen" + suffix + oklFileSuffix;
+    kernelName = "ellipticPreconCoarsen" + suffix;
     coarsenKernel = elliptic.platform.buildKernel(fileName, kernelName, kernelInfo);
 
-    sprintf(fileName, DELLIPTIC "/okl/ellipticPreconProlongate%s.okl", suffix);
-    sprintf(kernelName, "ellipticPreconProlongate%s", suffix);
+    fileName   = oklFilePrefix + "ellipticPreconProlongate" + suffix + oklFileSuffix;
+    kernelName = "ellipticPreconProlongate" + suffix;
     prolongateKernel = elliptic.platform.buildKernel(fileName, kernelName, kernelInfo);
   }
 }
 
 void MGLevel::AllocateStorage() {
   // extra storage for smoothing op
-  size_t Nbytes = Ncols*sizeof(dfloat);
-  if (smootherResidualBytes < Nbytes) {
-    if (o_smootherResidual.size()) {
-      free(smootherResidual);
-      o_smootherResidual.free();
-      o_smootherResidual2.free();
-      o_smootherUpdate.free();
-    }
-
-    smootherResidual = (dfloat *) calloc(Ncols,sizeof(dfloat));
-    o_smootherResidual = elliptic.platform.malloc(Nbytes,smootherResidual);
-    o_smootherResidual2 = elliptic.platform.malloc(Nbytes,smootherResidual);
-    o_smootherUpdate = elliptic.platform.malloc(Nbytes,smootherResidual);
-    smootherResidualBytes = Nbytes;
+  if (NsmootherResidual < Ncols) {
+    smootherResidual.malloc(Ncols, 0);
+    o_smootherResidual  = elliptic.platform.malloc<dfloat>(smootherResidual);
+    o_smootherResidual2 = elliptic.platform.malloc<dfloat>(smootherResidual);
+    o_smootherUpdate    = elliptic.platform.malloc<dfloat>(smootherResidual);
+    NsmootherResidual = Ncols;
   }
 
-  Nbytes = mesh.Nelements*mesh.Np*sizeof(dfloat);
-  if (scratchBytes < Nbytes) {
-    if (o_transferScratch.size()) {
-      o_transferScratch.free();
-    }
-    dfloat *dummy = (dfloat *) calloc(mesh.Nelements*mesh.Np,sizeof(dfloat));
-    o_transferScratch = elliptic.platform.malloc(Nbytes, dummy);
-    free(dummy);
-    scratchBytes = Nbytes;
+  if (Nscratch < mesh.Nelements*mesh.Np) {
+    memory<dfloat> dummy(mesh.Nelements*mesh.Np,0);
+    o_transferScratch = elliptic.platform.malloc<dfloat>(dummy);
+    Nscratch = mesh.Nelements*mesh.Np;
   }
 }
 
 void MGLevel::Report() {
 
-  hlong hNrows = (hlong) Nrows;
-
-  dlong minNrows=0, maxNrows=0;
-  hlong totalNrows=0;
-  dfloat avgNrows;
+  int totalActive=(Nrows>0) ? 1:0;
+  mesh.comm.Allreduce(totalActive);
 
-  MPI_Allreduce(&Nrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, mesh.comm);
-  MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, mesh.comm);
-  avgNrows = (dfloat) totalNrows/mesh.size;
+  dlong minNrows=Nrows, maxNrows=Nrows;
+  hlong totalNrows=Nrows;
+  mesh.comm.Allreduce(maxNrows, Comm::Max);
+  mesh.comm.Allreduce(totalNrows, Comm::Sum);
+  dfloat avgNrows = static_cast<dfloat>(totalNrows)/totalActive;
 
   if (Nrows==0) Nrows=maxNrows; //set this so it's ignored for the global min
-  MPI_Allreduce(&Nrows, &minNrows, 1, MPI_DLONG, MPI_MIN, mesh.comm);
+  mesh.comm.Allreduce(minNrows, Comm::Min);
 
   char smootherString[BUFSIZ];
   if (stype==JACOBI)
@@ -331,24 +342,18 @@ void MGLevel::Report() {
   }
 }
 
-MGLevel::~MGLevel() {
-  coarsenKernel.free();
-  partialCoarsenKernel.free();
-  prolongateKernel.free();
-  partialProlongateKernel.free();
-}
-
 void MGLevel::SetupSmoother() {
 
   //set up the fine problem smoothing
-  dfloat *diagA    = (dfloat*) calloc(Nrows, sizeof(dfloat));
-  dfloat *invDiagA = (dfloat*) calloc(Nrows, sizeof(dfloat));
+  memory<dfloat> diagA   (Nrows);
+  memory<dfloat> invDiagA(Nrows);
   elliptic.BuildOperatorDiagonal(diagA);
 
-  for (dlong n=0;n<Nrows;n++)
+  for (dlong n=0;n<Nrows;n++) {
     invDiagA[n] = 1.0/diagA[n];
+  }
 
-  o_invDiagA = elliptic.platform.malloc(Nrows*sizeof(dfloat), invDiagA);
+  o_invDiagA = elliptic.platform.malloc<dfloat>(invDiagA);
 
   if (elliptic.settings.compareSetting("MULTIGRID SMOOTHER","CHEBYSHEV")) {
     stype = CHEBYSHEV;
@@ -376,8 +381,6 @@ void MGLevel::SetupSmoother() {
     //update diagonal with weight
     o_invDiagA.copyFrom(invDiagA);
   }
-  free(diagA);
-  free(invDiagA);
 }
 
 
@@ -392,28 +395,28 @@ dfloat MGLevel::maxEigSmoothAx(){
   const dlong N = Nrows;
   const dlong M = Ncols;
 
+  linAlg_t& linAlg = platform.linAlg();
+
   int k = 10;
 
-  hlong Nlocal = (hlong) Nrows;
-  hlong Ntotal = 0;
-  MPI_Allreduce(&Nlocal, &Ntotal, 1, MPI_HLONG, MPI_SUM, mesh.comm);
+  hlong Ntotal = Nrows;
+  mesh.comm.Allreduce(Ntotal);
   if(k > Ntotal) k = static_cast<int>(Ntotal);
 
   // do an arnoldi
 
   // allocate memory for Hessenberg matrix
-  double *H = (double *) calloc(k*k,sizeof(double));
+  memory<double> H(k*k,0.0);
 
   // allocate memory for basis
-  dfloat *Vx = (dfloat*) calloc(M, sizeof(dfloat));
-  //  occa::memory *o_V = (occa::memory *) calloc(k+1, sizeof(occa::memory));
-  occa::memory *o_V = new occa::memory[k+1];
+  memory<dfloat> Vx(M, 0.0); //zero-filled (as the old calloc was): only the first N entries are set below, and this buffer seeds the device allocations
+  memory<deviceMemory<dfloat>> o_V(k+1);
 
-  occa::memory o_Vx  = elliptic.platform.malloc(M*sizeof(dfloat),Vx);
-  occa::memory o_AVx = elliptic.platform.malloc(M*sizeof(dfloat),Vx);
+  deviceMemory<dfloat> o_Vx  = elliptic.platform.malloc<dfloat>(Vx);
+  deviceMemory<dfloat> o_AVx = elliptic.platform.malloc<dfloat>(Vx);
 
   for(int i=0; i<=k; i++)
-    o_V[i] = elliptic.platform.malloc(M*sizeof(dfloat),Vx);
+    o_V[i] = elliptic.platform.malloc<dfloat>(Vx);
 
   // generate a random vector for initial basis vector
   for (dlong i=0;i<N;i++) Vx[i] = (dfloat) drand48();
@@ -437,7 +440,7 @@ dfloat MGLevel::maxEigSmoothAx(){
       // v[j+1] = v[j+1] - hij*v[i]
       linAlg.axpy(N, -hij, o_V[i], 1.f, o_V[j+1]);
 
-      H[i + j*k] = (double) hij;
+      H[i + j*k] = static_cast<double>(hij);
     }
 
     if(j+1 < k){
@@ -445,14 +448,14 @@ dfloat MGLevel::maxEigSmoothAx(){
       dfloat norm_vj =  linAlg.norm2(N, o_V[j+1], mesh.comm);
       linAlg.scale(N, 1.0/norm_vj, o_V[j+1]);
 
-      H[j+1+ j*k] = (double) norm_vj;
+      H[j+1+ j*k] = static_cast<double>(norm_vj);
     }
   }
 
-  double *WR = (double *) malloc(k*sizeof(double));
-  double *WI = (double *) malloc(k*sizeof(double));
+  memory<double> WR(k);
+  memory<double> WI(k);
 
-  matrixEigenValues(k, H, WR, WI);
+  linAlg_t::matrixEigenValues(k, H, WR, WI);
 
   double rho = 0.;
 
@@ -464,17 +467,6 @@ dfloat MGLevel::maxEigSmoothAx(){
     }
   }
 
-  // free memory
-  free(H);
-  free(WR);
-  free(WI);
-
-  free(Vx);
-  o_Vx.free();
-  o_AVx.free();
-  for(int i=0; i<=k; i++) o_V[i].free();
-  delete[] o_V;
-
   // if((mesh.rank==0)) printf("weight = %g \n", rho);
 
   return rho;
diff --git a/solvers/elliptic/src/ellipticPreconOAS.cpp b/solvers/elliptic/src/ellipticPreconOAS.cpp
index 8854105c2..52a161950 100644
--- a/solvers/elliptic/src/ellipticPreconOAS.cpp
+++ b/solvers/elliptic/src/ellipticPreconOAS.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -30,7 +30,7 @@ SOFTWARE.
 //  entire local mesh + 1 ring overlap, solved with a local multigrid
 //  precon and coarse problem consisting of the global degree 1
 //  problem, solved with parAlmond
-void OASPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) {
+void OASPrecon::Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr) {
 
   if (mesh.N>1) {
     if (elliptic.disc_c0) {
@@ -40,45 +40,47 @@ void OASPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) {
       // to a single operation, but currently theres no easy way
       // as the ordering of globalDofs between the original mesh
       // partition and the ring mesh could be different
-      elliptic.ogsMasked->Scatter(o_rPatchL, o_r, ogs_dfloat, ogs_add, ogs_notrans);
-      mesh.ringHalo->Exchange(o_rPatchL, mesh.Np, ogs_dfloat);
-      ellipticPatch->ogsMasked->Gather(o_rPatch, o_rPatchL, ogs_dfloat, ogs_add, ogs_notrans);
+      elliptic.ogsMasked.Scatter(o_rPatchL, o_r, 1, ogs::NoTrans);
+      mesh.ringHalo.Exchange(o_rPatchL, mesh.Np);
+      ellipticPatch.ogsMasked.Gather(o_rPatch, o_rPatchL, 1, ogs::Add, ogs::NoTrans);
     } else {
-      o_rPatch.copyFrom(o_r, elliptic.Ndofs*sizeof(dfloat));
-      mesh.ringHalo->Exchange(o_rPatch, mesh.Np, ogs_dfloat);
+      o_rPatch.copyFrom(o_r, elliptic.Ndofs);
+      mesh.ringHalo.Exchange(o_rPatch, mesh.Np);
     }
 
     //Apply local patch precon
-    preconPatch->Operator(o_rPatch, o_zPatch);
+    preconPatch.Operator(o_rPatch, o_zPatch);
 
     //Coarsen problem to N=1 and pass to parAlmond
     // TODO: This is blocking due to H<->D transfers.
     //       Should modify precons so size=1 is non-blocking
-    level->coarsen(o_r, o_rC);
+    level.coarsen(o_r, o_rC);
 
     parAlmond.Operator(o_rC, o_zC);
 
+    linAlg_t& linAlg = elliptic.platform.linAlg();
+
     //Add contributions from all patches together
     if (elliptic.disc_c0) {
       dlong Ntotal=mesh.Nelements*mesh.Np;
 
-      ellipticPatch->ogsMasked->Scatter(o_zPatchL, o_zPatch, ogs_dfloat, ogs_add, ogs_notrans);
-      ogsMaskedRing->GatherScatter(o_zPatchL, ogs_dfloat, ogs_add, ogs_sym);
+      ellipticPatch.ogsMasked.Scatter(o_zPatchL, o_zPatch, 1, ogs::NoTrans);
+      ogsMaskedRing.GatherScatter(o_zPatchL, 1, ogs::Add, ogs::Sym);
 
       // Weight by overlap degree, zPatch = patchWeight*zPatch
-      elliptic.linAlg.amx(Ntotal, 1.0, o_patchWeight, o_zPatchL);
+      linAlg.amx(Ntotal, 1.0, o_patchWeight, o_zPatchL);
 
-      elliptic.ogsMasked->Gather(o_Mr, o_zPatchL, ogs_dfloat, ogs_add, ogs_notrans);
+      elliptic.ogsMasked.Gather(o_Mr, o_zPatchL, 1, ogs::Add, ogs::NoTrans);
 
     } else {
-      mesh.ringHalo->Combine(o_zPatch, mesh.Np, ogs_dfloat);
+      mesh.ringHalo.Combine(o_zPatch, mesh.Np);
 
       // Weight by overlap degree, Mr = patchWeight*zPatch
-      elliptic.linAlg.amxpy(elliptic.Ndofs, 1.0, o_patchWeight, o_zPatch, 0.0, o_Mr);
+      linAlg.amxpy(elliptic.Ndofs, 1.0, o_patchWeight, o_zPatch, 0.0, o_Mr);
     }
 
     // Add prologatated coarse solution
-    level->prolongate(o_zC, o_Mr);
+    level.prolongate(o_zC, o_Mr);
   } else {
     //if N=1 just call the coarse solver
     parAlmond.Operator(o_r, o_Mr);
@@ -94,78 +96,86 @@ OASPrecon::OASPrecon(elliptic_t& _elliptic):
 
   //build the one ring mesh
   if (mesh.N>1) {
+    if (Comm::World().rank()==0){
+      printf("-----------------------------Multigrid Degree %2d Patch--------------------------------------\n", mesh.N);
+    }
     meshPatch = mesh.SetupRingPatch();
-    ellipticPatch = elliptic.SetupRingPatch(*meshPatch);
-    preconPatch = new MultiGridPrecon(*ellipticPatch);
+    ellipticPatch = elliptic.SetupRingPatch(meshPatch);
+    preconPatch.Setup<MultiGridPrecon>(ellipticPatch);
 
     if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) {
-      rPatchL = (dfloat*) calloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements),sizeof(dfloat));
-      zPatchL = (dfloat*) calloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements),sizeof(dfloat));
+      rPatchL.malloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements),0.0);
+      zPatchL.malloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements),0.0);
 
-      o_rPatchL = elliptic.platform.malloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements)*sizeof(dfloat), rPatchL);
-      o_zPatchL = elliptic.platform.malloc(mesh.Np*(mesh.Nelements+mesh.totalRingElements)*sizeof(dfloat), zPatchL);
+      o_rPatchL = elliptic.platform.malloc<dfloat>(rPatchL);
+      o_zPatchL = elliptic.platform.malloc<dfloat>(zPatchL);
     }
 
-    rPatch  = (dfloat*) calloc(ellipticPatch->Ndofs,sizeof(dfloat));
-    zPatch  = (dfloat*) calloc(ellipticPatch->Ndofs,sizeof(dfloat));
-    o_rPatch = elliptic.platform.malloc(ellipticPatch->Ndofs*sizeof(dfloat), rPatch);
-    o_zPatch = elliptic.platform.malloc(ellipticPatch->Ndofs*sizeof(dfloat), zPatch);
+    rPatch.malloc(ellipticPatch.Ndofs,0.0);
+    zPatch.malloc(ellipticPatch.Ndofs,0.0);
+    o_rPatch = elliptic.platform.malloc<dfloat>(rPatch);
+    o_zPatch = elliptic.platform.malloc<dfloat>(zPatch);
 
     //compute patch overlap weighting
-    patchWeight = (dfloat*) malloc(meshPatch->Nelements*meshPatch->Np*sizeof(dfloat));
-    for (int i=0;i<meshPatch->Nelements*meshPatch->Np;i++)
+    patchWeight.malloc(meshPatch.Nelements*meshPatch.Np);
+    for (int i=0;i<meshPatch.Nelements*meshPatch.Np;i++)
       patchWeight[i] = 1.0;
 
     if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) {
       //share the masked version of the global id numbering
-      hlong *maskedRingGlobalIds = (hlong *) calloc(meshPatch->Nelements*meshPatch->Np,sizeof(hlong));
-      memcpy(maskedRingGlobalIds, elliptic.maskedGlobalIds, mesh.Nelements*mesh.Np*sizeof(hlong));
-      mesh.ringHalo->Exchange(maskedRingGlobalIds, mesh.Np, ogs_hlong);
+      memory<hlong> maskedRingGlobalIds(meshPatch.Nelements*meshPatch.Np);
+      maskedRingGlobalIds.copyFrom(elliptic.maskedGlobalIds, mesh.Nelements*mesh.Np);
+      mesh.ringHalo.Exchange(maskedRingGlobalIds, mesh.Np);
 
       //mask ring
-      for (dlong n=0;n<ellipticPatch->Nmasked;n++)
-        maskedRingGlobalIds[ellipticPatch->maskIds[n]] = 0;
+      for (dlong n=0;n<ellipticPatch.Nmasked;n++)
+        maskedRingGlobalIds[ellipticPatch.maskIds[n]] = 0;
 
       //use the masked ids to make another gs handle
       int verbose = 0;
-      ogsMaskedRing = ogs_t::Setup(meshPatch->Nelements*meshPatch->Np, maskedRingGlobalIds,
-                                   mesh.comm, verbose, elliptic.platform);
-      free(maskedRingGlobalIds);
+      bool unique = true; //flag a unique node in every gather node
+      ogsMaskedRing.Setup(meshPatch.Nelements*meshPatch.Np,
+                          maskedRingGlobalIds, mesh.comm,
+                          ogs::Signed, ogs::Auto,
+                          unique, verbose, elliptic.platform);
 
       //determine overlap of each node with masked ogs
-      ogsMaskedRing->GatherScatter(patchWeight, ogs_dfloat, ogs_add, ogs_sym);
+      ogsMaskedRing.GatherScatter(patchWeight, 1, ogs::Add, ogs::Sym);
 
     } else {
       //determine overlap by combining halos
-      mesh.ringHalo->Combine(patchWeight, mesh.Np, ogs_dfloat);
+      mesh.ringHalo.Combine(patchWeight, mesh.Np);
     }
 
     //invert
-    for (int i=0;i<meshPatch->Nelements*meshPatch->Np;i++)
+    for (int i=0;i<meshPatch.Nelements*meshPatch.Np;i++)
       patchWeight[i] = (patchWeight[i] > 0.0) ? 1.0/patchWeight[i] : 0.0;
 
-    o_patchWeight = elliptic.platform.malloc(meshPatch->Nelements*meshPatch->Np*sizeof(dfloat), patchWeight);
+    o_patchWeight = elliptic.platform.malloc<dfloat>(patchWeight);
   }
 
   //build the coarse precon
   int Nc = 1;  //hard code
   int NpCoarse = mesh.Np;
   switch(mesh.elementType){
-    case TRIANGLES:
+    case Mesh::TRIANGLES:
       NpCoarse = ((Nc+1)*(Nc+2))/2; break;
-    case QUADRILATERALS:
+    case Mesh::QUADRILATERALS:
       NpCoarse = (Nc+1)*(Nc+1); break;
-    case TETRAHEDRA:
+    case Mesh::TETRAHEDRA:
       NpCoarse = ((Nc+1)*(Nc+2)*(Nc+3))/6; break;
-    case HEXAHEDRA:
+    case Mesh::HEXAHEDRA:
       NpCoarse = (Nc+1)*(Nc+1)*(Nc+1); break;
   }
 
   //build mesh and elliptic objects for this degree
-  mesh_t &meshC = mesh.SetupNewDegree(Nc);
-  elliptic_t &ellipticC = elliptic.SetupNewDegree(meshC);
+  mesh_t meshC = mesh.SetupNewDegree(Nc);
+  elliptic_t ellipticC = elliptic.SetupNewDegree(meshC);
 
   //build full A matrix and pass to parAlmond
+  if (Comm::World().rank()==0){
+    printf("-----------------------------Multigrid AMG Setup--------------------------------------------\n");
+  }
   parAlmond::parCOO A(elliptic.platform, meshC.comm);
   if (settings.compareSetting("DISCRETIZATION", "IPDG"))
     ellipticC.BuildOperatorMatrixIpdg(A);
@@ -177,8 +187,10 @@ OASPrecon::OASPrecon(elliptic_t& _elliptic):
   int size = meshC.size;
   hlong TotalRows = A.globalRowStarts[size];
   dlong numLocalRows = (dlong) (A.globalRowStarts[rank+1]-A.globalRowStarts[rank]);
-  dfloat *null = (dfloat *) malloc(numLocalRows*sizeof(dfloat));
-  for (dlong i=0;i<numLocalRows;i++) null[i] = 1.0/sqrt(TotalRows);
+  memory<dfloat> null(numLocalRows);
+  for (dlong i=0;i<numLocalRows;i++) {
+    null[i] = 1.0/sqrt(TotalRows);
+  }
 
   //set up AMG levels (treating the N=1 level as a matrix level)
   parAlmond.AMGSetup(A, ellipticC.allNeumann, null,ellipticC.allNeumannPenalty);
@@ -187,37 +199,25 @@ OASPrecon::OASPrecon(elliptic_t& _elliptic):
     //make an MG level to get prologation and coarsener
     dlong Nrows, Ncols;
     if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) {
-      Nrows = elliptic.ogsMasked->Ngather;
-      Ncols = Nrows + elliptic.ogsMasked->NgatherHalo;
+      Nrows = elliptic.ogsMasked.Ngather;
+      Ncols = Nrows + elliptic.gHalo.Nhalo;
     } else {
       Nrows = mesh.Nelements*mesh.Np;
       Ncols = Nrows + mesh.totalHaloPairs*mesh.Np;
     }
 
-    level = new MGLevel(elliptic, Nrows, Ncols, Nc, NpCoarse);
-    level->meshC = &meshC;
-    level->ogsMaskedC = ellipticC.ogsMasked;
+    level = MGLevel(elliptic, Nrows, Ncols, Nc, NpCoarse);
+    level.meshC = meshC;
+    level.ellipticC = ellipticC;
 
     //coarse buffers
     Ncols = parAlmond.getNumCols(0);
-    rC = (dfloat*) calloc(Ncols,sizeof(dfloat));
-    zC = (dfloat*) calloc(Ncols,sizeof(dfloat));
-    o_rC = elliptic.platform.malloc(Ncols*sizeof(dfloat), rC);
-    o_zC = elliptic.platform.malloc(Ncols*sizeof(dfloat), zC);
+    rC.malloc(Ncols,0.0);
+    zC.malloc(Ncols,0.0);
+    o_rC = elliptic.platform.malloc<dfloat>(rC);
+    o_zC = elliptic.platform.malloc<dfloat>(zC);
   }
 
   //report
   parAlmond.Report();
 }
-
-OASPrecon::~OASPrecon() {
-  if (mesh.N>1) {
-    delete preconPatch;
-    if (mesh.size>1) delete ellipticPatch;
-    if (mesh.size>1) delete meshPatch;
-
-    delete &(level->elliptic);
-    if (level->mesh.ogs) level->mesh.ogs->Free();
-    delete level;
-  }
-}
\ No newline at end of file
diff --git a/solvers/elliptic/src/ellipticPreconParAlmond.cpp b/solvers/elliptic/src/ellipticPreconParAlmond.cpp
index 298e9af70..5b9bf3768 100644
--- a/solvers/elliptic/src/ellipticPreconParAlmond.cpp
+++ b/solvers/elliptic/src/ellipticPreconParAlmond.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ SOFTWARE.
 #include "ellipticPrecon.hpp"
 
 //AMG preconditioner via parAlmond
-void ParAlmondPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) {
+void ParAlmondPrecon::Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr) {
 
   //hand off to parAlmond
   parAlmond.Operator(o_r, o_Mr);
@@ -41,6 +41,9 @@ ParAlmondPrecon::ParAlmondPrecon(elliptic_t& _elliptic):
   parAlmond(elliptic.platform, settings, elliptic.mesh.comm) {
 
   //build full A matrix and pass to parAlmond
+  if (Comm::World().rank()==0){
+    printf("-----------------------------Multigrid AMG Setup--------------------------------------------\n");
+  }
   parAlmond::parCOO A(elliptic.platform, elliptic.mesh.comm);
   if (settings.compareSetting("DISCRETIZATION", "IPDG")) {
     elliptic.BuildOperatorMatrixIpdg(A);
@@ -52,12 +55,13 @@ ParAlmondPrecon::ParAlmondPrecon(elliptic_t& _elliptic):
   int rank = elliptic.mesh.rank;
   int size = elliptic.mesh.size;
   hlong TotalRows = A.globalRowStarts[size];
-  dlong numLocalRows = (dlong) (A.globalRowStarts[rank+1]-A.globalRowStarts[rank]);
-  dfloat *null = (dfloat *) malloc(numLocalRows*sizeof(dfloat));
-  for (dlong i=0;i<numLocalRows;i++) null[i] = 1.0/sqrt(TotalRows);
+  dlong numLocalRows = static_cast<dlong>(A.globalRowStarts[rank+1]-A.globalRowStarts[rank]);
+  memory<dfloat> null(numLocalRows);
+  for (dlong i=0;i<numLocalRows;i++) {
+    null[i] = 1.0/sqrt(TotalRows);
+  }
 
   parAlmond.AMGSetup(A, elliptic.allNeumann, null, elliptic.allNeumannPenalty);
-  free(null);
 
   parAlmond.Report();
 
@@ -66,7 +70,5 @@ ParAlmondPrecon::ParAlmondPrecon(elliptic_t& _elliptic):
   dlong parAlmondNrows = parAlmond.getNumRows(0);
   dlong parAlmondNcols = parAlmond.getNumCols(0);
   dlong parAlmondNhalo = parAlmondNcols - parAlmondNrows;
-  elliptic.Nhalo = mymax(elliptic.Nhalo, parAlmondNhalo);
+  _elliptic.Nhalo = std::max(_elliptic.Nhalo, parAlmondNhalo);
 }
-
-ParAlmondPrecon::~ParAlmondPrecon() {}
\ No newline at end of file
diff --git a/solvers/elliptic/src/ellipticPreconSEMFEM.cpp b/solvers/elliptic/src/ellipticPreconSEMFEM.cpp
index 7adc5736d..a6750455a 100644
--- a/solvers/elliptic/src/ellipticPreconSEMFEM.cpp
+++ b/solvers/elliptic/src/ellipticPreconSEMFEM.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,25 +27,27 @@ SOFTWARE.
 #include "ellipticPrecon.hpp"
 
 // Cast problem into spectrally-equivalent N=1 FEM space and precondition with AMG
-void SEMFEMPrecon::Operator(occa::memory& o_r, occa::memory& o_Mr) {
+void SEMFEMPrecon::Operator(deviceMemory<dfloat>& o_r, deviceMemory<dfloat>& o_Mr) {
 
-  if (mesh.elementType==TRIANGLES) {
+  linAlg_t& linAlg = elliptic.platform.linAlg();
+
+  if (mesh.elementType==Mesh::TRIANGLES) {
 
     // Mr = invDegree.*r
-    elliptic.linAlg.amxpy(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_r, 0.0, o_Mr);
+    linAlg.amxpy(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_r, 0.0, o_Mr);
 
-    elliptic.ogsMasked->Scatter(o_MrL, o_Mr, ogs_dfloat, ogs_add, ogs_notrans);
+    elliptic.ogsMasked.Scatter(o_MrL, o_Mr, 1, ogs::NoTrans);
     SEMFEMInterpKernel(mesh.Nelements, mesh.o_SEMFEMAnterp, o_MrL, o_rFEM);
-    FEMogs->Gather(o_GrFEM, o_rFEM, ogs_dfloat, ogs_add, ogs_trans);
+    FEMogs.Gather(o_GrFEM, o_rFEM, 1, ogs::Add, ogs::Trans);
 
     parAlmond.Operator(o_GrFEM, o_GzFEM);
 
-    FEMogs->Scatter(o_zFEM, o_GzFEM, ogs_dfloat, ogs_add, ogs_notrans);
+    FEMogs.Scatter(o_zFEM, o_GzFEM, 1, ogs::NoTrans);
     SEMFEMAnterpKernel(mesh.Nelements, mesh.o_SEMFEMAnterp, o_zFEM, o_MrL);
-    elliptic.ogsMasked->Gather(o_Mr, o_MrL, ogs_dfloat, ogs_add, ogs_trans);
+    elliptic.ogsMasked.Gather(o_Mr, o_MrL, 1, ogs::Add, ogs::Trans);
 
     // Mr = invDegree.*Mr
-    elliptic.linAlg.amx(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_Mr);
+    linAlg.amx(elliptic.Ndofs, 1.0, elliptic.o_weightG, o_Mr);
 
   } else {
     //pass to parAlmond
@@ -61,49 +63,30 @@ SEMFEMPrecon::SEMFEMPrecon(elliptic_t& _elliptic):
   parAlmond(elliptic.platform, settings, mesh.comm) {
 
   //sanity checking
-  if (!settings.compareSetting("DISCRETIZATION", "CONTINUOUS") )
-    LIBP_ABORT(string("SEMFEM is supported for CONTINUOUS only"));
+  LIBP_ABORT("SEMFEM is supported for CONTINUOUS only",
+             !settings.compareSetting("DISCRETIZATION", "CONTINUOUS"));
 
   //make a low-order fem mesh from the sem mesh (also return globalIds of the enriched sem nodes, and faceNode mapping)
-  int Nfp = 0;
-  int *faceNodes = NULL;
-  hlong *globalIds = NULL;
-  femMesh = mesh.SetupSEMFEM(&globalIds, &Nfp, &faceNodes);
+  memory<hlong> globalIds;
+  memory<int> mapB;
+  femMesh = mesh.SetupSEMFEM(globalIds, mapB);
 
   //use the BCs to make a maskedGlobalIds array
   dlong Ntotal = mesh.NpFEM*mesh.Nelements;
-  hlong* maskedGlobalIds = (hlong *) calloc(Ntotal,sizeof(hlong));
-  memcpy(maskedGlobalIds, globalIds, Ntotal*sizeof(hlong));
-  if (mesh.elementType==TRIANGLES) { //build a new mask for NpFEM>Np node sets
-    // gather-scatter
-    int verbose = 0;
-    ogs_t *ogs = ogs_t::Setup(Ntotal, globalIds, mesh.comm, verbose, elliptic.platform);
-
-    //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann)
-    const int largeNumber = 1<<20;
-    int *mapB = (int *) calloc(Ntotal,sizeof(int));
-    for (dlong e=0;e<mesh.Nelements;e++) {
-      for (int n=0;n<mesh.NpFEM;n++) mapB[n+e*mesh.NpFEM] = largeNumber;
-
-      for (int f=0;f<mesh.Nfaces;f++) {
-        int bc = elliptic.EToB[f+e*mesh.Nfaces];
-        if (bc>0) {
-          for (int n=0;n<Nfp;n++) {
-            int fid = faceNodes[n+f*Nfp];
-            mapB[fid+e*mesh.NpFEM] = mymin(bc,mapB[fid+e*mesh.Np]);
-          }
-        }
+  memory<hlong> maskedGlobalIds(Ntotal);
+  maskedGlobalIds.copyFrom(globalIds, Ntotal);
+
+  if (mesh.elementType==Mesh::TRIANGLES) { //build a new mask for NpFEM>Np node sets
+    //translate the node-wise bc flag
+    for (dlong n=0;n<Ntotal;n++) { //dlong index to match the type of Ntotal
+      int bc = mapB[n];
+      if (bc>0) {
+        int BC = elliptic.BCType[bc];     //translate mesh's boundary flag
+        mapB[n] = BC;  //record it
+
+        if (mapB[n] == 1) maskedGlobalIds[n] = 0;   //Dirichlet boundary
       }
     }
-    ogs->GatherScatter(mapB, ogs_int, ogs_min, ogs_sym);
-
-    //use the bc flags to find masked ids
-    for (dlong n=0;n<mesh.Nelements*mesh.NpFEM;n++)
-      if (mapB[n] == 1) //Dirichlet boundary
-        maskedGlobalIds[n] = 0;
-
-    free(mapB);
-    ogs->Free();
   } else {
     //mask using the original mask
     for (dlong n=0;n<elliptic.Nmasked;n++)
@@ -112,39 +95,41 @@ SEMFEMPrecon::SEMFEMPrecon(elliptic_t& _elliptic):
 
   //build masked gs handle to gather from enriched sem nodes to assembled fem problem
   int verbose = 0;
-  ogs_t::Unique(maskedGlobalIds, Ntotal, mesh.comm);     //flag a unique node in every gather node
-  FEMogs = ogs_t::Setup(Ntotal, maskedGlobalIds, mesh.comm, verbose, elliptic.platform);
+  bool unique = true;
+  FEMogs.Setup(Ntotal, maskedGlobalIds,
+               mesh.comm, ogs::Signed, ogs::Auto,
+               unique, verbose, elliptic.platform);
 
   //make a map from the fem mesh's nodes to the (enriched) sem nodes
-  dlong *localIds = (dlong *) calloc(femMesh->Nelements*femMesh->Nverts,sizeof(dlong));
+  memory<dlong> localIds(femMesh.Nelements*femMesh.Nverts);
   for(dlong e=0;e<mesh.Nelements;++e){
     for (int n=0;n<mesh.NelFEM;n++) {
-      dlong id[femMesh->Nverts];
+      dlong id[femMesh.Nverts];
 
       //local ids in the subelement fem grid
       for (int i=0;i<mesh.Nverts;i++)
-        id[i] = e*mesh.NpFEM + mesh.FEMEToV[n*femMesh->Nverts+i];
+        id[i] = e*mesh.NpFEM + mesh.FEMEToV[n*femMesh.Nverts+i];
 
-      dlong femId = e*mesh.NelFEM*femMesh->Nverts+n*mesh.Nverts;
+      dlong femId = e*mesh.NelFEM*femMesh.Nverts+n*mesh.Nverts;
       switch(mesh.elementType){
-      case TRIANGLES:
+      case Mesh::TRIANGLES:
         localIds[femId+0] = id[0];
         localIds[femId+1] = id[1];
         localIds[femId+2] = id[2];
         break;
-      case QUADRILATERALS:
+      case Mesh::QUADRILATERALS:
         localIds[femId+0] = id[0];
         localIds[femId+1] = id[1];
         localIds[femId+2] = id[3];  //need to swap this as the Np nodes are ordered [0,1,3,2] in a degree 1 element
         localIds[femId+3] = id[2];
         break;
-      case TETRAHEDRA:
+      case Mesh::TETRAHEDRA:
         localIds[femId+0] = id[0];
         localIds[femId+1] = id[1];
         localIds[femId+2] = id[2];
         localIds[femId+3] = id[3];
         break;
-      case HEXAHEDRA:
+      case Mesh::HEXAHEDRA:
         localIds[femId+0] = id[0];
         localIds[femId+1] = id[1];
         localIds[femId+2] = id[3];  //need to swap this as the Np nodes are ordered [0,1,3,2,4,5,7,6] in a degree 1 element
@@ -159,99 +144,98 @@ SEMFEMPrecon::SEMFEMPrecon(elliptic_t& _elliptic):
   }
 
   //make a fem elliptic solver
-  femElliptic = new elliptic_t(elliptic.platform, *femMesh,
-                               elliptic.settings, elliptic.lambda);
-  femElliptic->ogsMasked = FEMogs; //only for getting Ngather when building matrix
+  femElliptic.platform = elliptic.platform;
+  femElliptic.mesh = femMesh;
+  femElliptic.settings = elliptic.settings;
+  femElliptic.lambda = elliptic.lambda;
+
+  femElliptic.ogsMasked = FEMogs; //only for getting Ngather when building matrix
 
   // number of degrees of freedom on this rank (after gathering)
-  hlong Ngather = FEMogs->Ngather;
+  hlong Ngather = FEMogs.Ngather;
 
   // create a global numbering system
-  hlong *globalIds2 = (hlong *) calloc(Ngather,sizeof(hlong));
+  memory<hlong> globalIds2(Ngather);
 
   // every gathered degree of freedom has its own global id
-  hlong *globalStarts = (hlong *) calloc(mesh.size+1,sizeof(hlong));
-  MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts+1, 1, MPI_HLONG, mesh.comm);
-  for(int r=0;r<mesh.size;++r)
-    globalStarts[r+1] = globalStarts[r]+globalStarts[r+1];
+  hlong globalGatherOffset = Ngather;
+  mesh.comm.Scan(Ngather, globalGatherOffset);
+  globalGatherOffset = globalGatherOffset - Ngather;
 
   //use the offsets to set a consecutive global numbering
-  for (dlong n =0;n<FEMogs->Ngather;n++) {
-    globalIds2[n] = n + globalStarts[mesh.rank];
+  for (dlong n =0;n<FEMogs.Ngather;n++) {
+    globalIds2[n] = n + globalGatherOffset;
   }
-  free(globalStarts);
 
   //scatter this numbering to the original nodes
   Ntotal = mesh.NpFEM*mesh.Nelements;
-  hlong* maskedGlobalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong));
-  for (dlong n=0;n<Ntotal;n++) maskedGlobalNumbering[n] = -1;
-  FEMogs->Scatter(maskedGlobalNumbering, globalIds2, ogs_hlong, ogs_add, ogs_notrans);
-  free(globalIds2);
+  memory<hlong> maskedGlobalNumbering(Ntotal, -1);
+  FEMogs.Scatter(maskedGlobalNumbering, globalIds2, 1, ogs::NoTrans);
 
   //transfer the consecutive global numbering to the fem mesh
-  Ntotal = femMesh->Np*femMesh->Nelements;
-  femElliptic->maskedGlobalNumbering = (hlong *) calloc(Ntotal,sizeof(hlong));
+  Ntotal = femMesh.Np*femMesh.Nelements;
+  femElliptic.maskedGlobalNumbering.malloc(Ntotal);
 
-  for (dlong e=0;e<femMesh->Nelements;e++) {
-    for (int n=0;n<femMesh->Np;n++) {
-      dlong id = e*femMesh->Np + n;
+  for (dlong e=0;e<femMesh.Nelements;e++) {
+    for (int n=0;n<femMesh.Np;n++) {
+      dlong id = e*femMesh.Np + n;
       dlong localId = localIds[id];
-      femElliptic->maskedGlobalNumbering[id] = maskedGlobalNumbering[localId];
+      femElliptic.maskedGlobalNumbering[id] = maskedGlobalNumbering[localId];
     }
   }
-  free(localIds); free(maskedGlobalNumbering);
 
   //finally, build the fem matrix and pass to parAlmond
-  parAlmond::parCOO A(elliptic.platform, femMesh->comm);
-  femElliptic->BuildOperatorMatrixContinuous(A);
+  if (mesh.rank==0){
+    printf("-----------------------------Multigrid AMG Setup--------------------------------------------\n");
+  }
+  parAlmond::parCOO A(elliptic.platform, femMesh.comm);
+  femElliptic.BuildOperatorMatrixContinuous(A);
 
   //populate null space unit vector
-  int rank = femMesh->rank;
-  int size = femMesh->size;
+  int rank = femMesh.rank;
+  int size = femMesh.size;
   hlong TotalRows = A.globalRowStarts[size];
-  dlong numLocalRows = (dlong) (A.globalRowStarts[rank+1]-A.globalRowStarts[rank]);
-  dfloat *null = (dfloat *) malloc(numLocalRows*sizeof(dfloat));
-  for (dlong i=0;i<numLocalRows;i++) null[i] = 1.0/sqrt(TotalRows);
+  dlong numLocalRows = static_cast<dlong>(A.globalRowStarts[rank+1]-A.globalRowStarts[rank]);
+
+  memory<dfloat> null(numLocalRows);
+  for (dlong i=0;i<numLocalRows;i++) {
+    null[i] = 1.0/sqrt(TotalRows);
+  }
 
   parAlmond.AMGSetup(A, elliptic.allNeumann, null, elliptic.allNeumannPenalty);
-  free(null);
 
   parAlmond.Report();
 
-  if (mesh.elementType==TRIANGLES) {
+  if (mesh.elementType==Mesh::TRIANGLES) {
     // build interp and anterp
-    dfloat *SEMFEMAnterp = (dfloat*) calloc(mesh.NpFEM*mesh.Np, sizeof(dfloat));
+    memory<dfloat> SEMFEMAnterp(mesh.NpFEM*mesh.Np);
     for(int n=0;n<mesh.NpFEM;++n){
       for(int m=0;m<mesh.Np;++m){
         SEMFEMAnterp[n+m*mesh.NpFEM] = mesh.SEMFEMInterp[n*mesh.Np+m];
       }
     }
 
-    mesh.o_SEMFEMInterp = elliptic.platform.malloc(mesh.NpFEM*mesh.Np*sizeof(dfloat),mesh.SEMFEMInterp);
-    mesh.o_SEMFEMAnterp = elliptic.platform.malloc(mesh.NpFEM*mesh.Np*sizeof(dfloat),SEMFEMAnterp);
-
-    free(SEMFEMAnterp);
+    mesh.o_SEMFEMInterp = elliptic.platform.malloc<dfloat>(mesh.SEMFEMInterp);
+    mesh.o_SEMFEMAnterp = elliptic.platform.malloc<dfloat>(SEMFEMAnterp);
 
-    dfloat *dummy = (dfloat*) calloc(mesh.Nelements*mesh.NpFEM,sizeof(dfloat)); //need this to avoid uninitialized memory warnings
-    o_rFEM = elliptic.platform.malloc(mesh.Nelements*mesh.NpFEM*sizeof(dfloat), dummy);
-    o_zFEM = elliptic.platform.malloc(mesh.Nelements*mesh.NpFEM*sizeof(dfloat), dummy);
-    free(dummy);
+    memory<dfloat> dummy(mesh.Nelements*mesh.NpFEM,0.0); //need this to avoid uninitialized memory warnings
+    o_rFEM = elliptic.platform.malloc<dfloat>(dummy);
+    o_zFEM = elliptic.platform.malloc<dfloat>(dummy);
 
     dlong Ncols = parAlmond.getNumCols(0);
-    dummy = (dfloat*) calloc(Ncols,sizeof(dfloat));
-    o_GrFEM = elliptic.platform.malloc(Ncols*sizeof(dfloat),dummy);
-    o_GzFEM = elliptic.platform.malloc(Ncols*sizeof(dfloat),dummy);
-    free(dummy);
+    dummy.malloc(Ncols,0.0);
+    o_GrFEM = elliptic.platform.malloc<dfloat>(dummy);
+    o_GzFEM = elliptic.platform.malloc<dfloat>(dummy);
 
-    o_MrL = elliptic.platform.malloc(mesh.Np*mesh.Nelements*sizeof(dfloat));
+    o_MrL = elliptic.platform.malloc<dfloat>(mesh.Np*mesh.Nelements);
 
     //build kernels
-    occa::properties kernelInfo = mesh.props;
+    properties_t kernelInfo = mesh.props;
 
     kernelInfo["defines/" "p_Np"]= mesh.Np;
     kernelInfo["defines/" "p_NpFEM"]= mesh.NpFEM;
 
-    int NblockV = 512/mesh.NpFEM;
+    int NblockV = std::max(256/mesh.NpFEM, 1);
     kernelInfo["defines/" "p_NblockV"]= NblockV;
 
     SEMFEMInterpKernel = elliptic.platform.buildKernel(DELLIPTIC "/okl/ellipticSEMFEMInterp.okl",
@@ -261,12 +245,3 @@ SEMFEMPrecon::SEMFEMPrecon(elliptic_t& _elliptic):
                                      "ellipticSEMFEMAnterp", kernelInfo);
   }
 }
-
-SEMFEMPrecon::~SEMFEMPrecon() {
-  femElliptic->ogsMasked->Free();
-
-  femMesh->halo->Free();
-
-  SEMFEMInterpKernel.free();
-  SEMFEMAnterpKernel.free();
-}
\ No newline at end of file
diff --git a/solvers/elliptic/src/ellipticRun.cpp b/solvers/elliptic/src/ellipticRun.cpp
index bc48c79bc..ef9f6015a 100644
--- a/solvers/elliptic/src/ellipticRun.cpp
+++ b/solvers/elliptic/src/ellipticRun.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,108 +25,119 @@ SOFTWARE.
 */
 
 #include "elliptic.hpp"
+#include "timer.hpp"
 
 void elliptic_t::Run(){
 
   //setup linear solver
   hlong NglobalDofs;
   if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) {
-    NglobalDofs = ogsMasked->NgatherGlobal*Nfields;
+    NglobalDofs = ogsMasked.NgatherGlobal*Nfields;
   } else {
     NglobalDofs = mesh.NelementsGlobal*mesh.Np*Nfields;
   }
-  linearSolver_t *linearSolver = linearSolver_t::Setup(Ndofs, Nhalo,
-                                                       platform, settings, mesh.comm);
 
-  occa::properties kernelInfo = mesh.props; //copy base occa properties
+  linearSolver_t linearSolver; // concrete Krylov method is chosen from the "LINEAR SOLVER" setting
+  if (settings.compareSetting("LINEAR SOLVER","NBPCG")){
+    linearSolver.Setup<LinearSolver::nbpcg>(Ndofs, Nhalo, platform, settings, comm);
+  } else if (settings.compareSetting("LINEAR SOLVER","NBFPCG")){
+    linearSolver.Setup<LinearSolver::nbfpcg>(Ndofs, Nhalo, platform, settings, comm);
+  } else if (settings.compareSetting("LINEAR SOLVER","PCG")){
+    linearSolver.Setup<LinearSolver::pcg>(Ndofs, Nhalo, platform, settings, comm);
+  } else if (settings.compareSetting("LINEAR SOLVER","PGMRES")){
+    linearSolver.Setup<LinearSolver::pgmres>(Ndofs, Nhalo, platform, settings, comm);
+  } else if (settings.compareSetting("LINEAR SOLVER","PMINRES")){
+    linearSolver.Setup<LinearSolver::pminres>(Ndofs, Nhalo, platform, settings, comm);
+  } else { LIBP_FORCE_ABORT("Requested LINEAR SOLVER not found."); } // otherwise Solve() would get a solver that was never set up
+
+  properties_t kernelInfo = mesh.props; //copy base occa properties
 
-  string dataFileName;
+  std::string dataFileName;
   settings.getSetting("DATA FILE", dataFileName);
   kernelInfo["includes"] += dataFileName;
 
   //add standard boundary functions
-  char *boundaryHeaderFileName;
+  std::string boundaryHeaderFileName;
   if (mesh.dim==2)
-    boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary2D.h");
+    boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary2D.h");
   else if (mesh.dim==3)
-    boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary3D.h");
+    boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary3D.h");
   kernelInfo["includes"] += boundaryHeaderFileName;
 
-  int Nmax = mymax(mesh.Np, mesh.Nfaces*mesh.Nfp);
+  int Nmax = std::max(mesh.Np, mesh.Nfaces*mesh.Nfp);
   kernelInfo["defines/" "p_Nmax"]= Nmax;
 
   kernelInfo["defines/" "p_Nfields"]= Nfields;
 
   // set kernel name suffix
-  char *suffix;
-  if(mesh.elementType==TRIANGLES)
-    suffix = strdup("Tri2D");
-  if(mesh.elementType==QUADRILATERALS)
-    suffix = strdup("Quad2D");
-
-  if(mesh.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  if(mesh.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
-
-  if(mesh.elementType==QUADRILATERALS){
+  std::string suffix;
+  if(mesh.elementType==Mesh::TRIANGLES) {
+    suffix = "Tri2D";
+  } else if(mesh.elementType==Mesh::QUADRILATERALS) {
     if(mesh.dim==2)
-      suffix = strdup("Quad2D");
+      suffix = "Quad2D";
     else
-      suffix = strdup("Quad3D");
+      suffix = "Quad3D";
+  } else if(mesh.elementType==Mesh::TETRAHEDRA) {
+    suffix = "Tet3D";
+  } else { // remaining case: mesh.elementType==Mesh::HEXAHEDRA
+    suffix = "Hex3D";
   }
 
-  sprintf(fileName, DELLIPTIC "/okl/ellipticRhs%s.okl", suffix);
-  sprintf(kernelName, "ellipticRhs%s", suffix);
-  occa::kernel forcingKernel = platform.buildKernel(fileName, kernelName,
+  std::string oklFilePrefix = DELLIPTIC "/okl/";
+  std::string oklFileSuffix = ".okl";
+
+  std::string fileName, kernelName;
+
+  fileName   = oklFilePrefix + "ellipticRhs" + suffix + oklFileSuffix;
+  kernelName = "ellipticRhs" + suffix;
+  kernel_t forcingKernel = platform.buildKernel(fileName, kernelName,
                                                     kernelInfo);
 
-  occa::kernel rhsBCKernel, addBCKernel;
+  kernel_t rhsBCKernel, addBCKernel;
   if (settings.compareSetting("DISCRETIZATION","IPDG")) {
-    sprintf(fileName, DELLIPTIC "/okl/ellipticRhsBCIpdg%s.okl", suffix);
-    sprintf(kernelName, "ellipticRhsBCIpdg%s", suffix);
+    fileName   = oklFilePrefix + "ellipticRhsBCIpdg" + suffix + oklFileSuffix;
+    kernelName = "ellipticRhsBCIpdg" + suffix;
 
     rhsBCKernel = platform.buildKernel(fileName,kernelName, kernelInfo);
   } else if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) {
-    sprintf(fileName, DELLIPTIC "/okl/ellipticRhsBC%s.okl", suffix);
-    sprintf(kernelName, "ellipticRhsBC%s", suffix);
+    fileName   = oklFilePrefix + "ellipticRhsBC" + suffix + oklFileSuffix;
+    kernelName = "ellipticRhsBC" + suffix;
 
     rhsBCKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
 
-    sprintf(fileName, DELLIPTIC "/okl/ellipticAddBC%s.okl", suffix);
-    sprintf(kernelName, "ellipticAddBC%s", suffix);
+    fileName   = oklFilePrefix + "ellipticAddBC" + suffix + oklFileSuffix;
+    kernelName = "ellipticAddBC" + suffix;
 
     addBCKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
   }
 
   //create occa buffers
   dlong Nall = mesh.Np*(mesh.Nelements+mesh.totalHaloPairs);
-  dfloat *rL = (dfloat*) calloc(Nall, sizeof(dfloat));
-  dfloat *xL = (dfloat*) calloc(Nall, sizeof(dfloat));
-  occa::memory o_rL = platform.malloc(Nall*sizeof(dfloat), rL);
-  occa::memory o_xL = platform.malloc(Nall*sizeof(dfloat), xL);
+  memory<dfloat> rL(Nall, 0.0);
+  memory<dfloat> xL(Nall, 0.0);
+  deviceMemory<dfloat> o_rL = platform.malloc<dfloat>(rL);
+  deviceMemory<dfloat> o_xL = platform.malloc<dfloat>(xL);
 
-  occa::memory o_r, o_x;
+  deviceMemory<dfloat> o_r, o_x;
   if (settings.compareSetting("DISCRETIZATION","IPDG")) {
     o_r = o_rL;
     o_x = o_xL;
   } else {
-    dlong Ng = ogsMasked->Ngather;
-    dlong Nghalo = ogsMasked->NgatherHalo;
+    dlong Ng = ogsMasked.Ngather;
+    dlong Nghalo = gHalo.Nhalo;
     dlong Ngall = Ng + Nghalo;
-    o_r = platform.malloc(Ngall*sizeof(dfloat));
-    o_x = platform.malloc(Ngall*sizeof(dfloat));
+    o_r = platform.malloc<dfloat>(Ngall);
+    o_x = platform.malloc<dfloat>(Ngall);
   }
 
   //storage for M*q during reporting
-  occa::memory o_MxL = platform.malloc(Nall*sizeof(dfloat), xL);
+  deviceMemory<dfloat> o_MxL = platform.malloc<dfloat>(xL);
   mesh.MassMatrixKernelSetup(Nfields); // mass matrix operator
 
   //populate rhs forcing
   forcingKernel(mesh.Nelements,
-                mesh.o_ggeo,
+                mesh.o_wJ,
                 mesh.o_MM,
                 mesh.o_x,
                 mesh.o_y,
@@ -151,6 +162,7 @@ void elliptic_t::Run(){
                 o_rL);
   } else if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) {
     rhsBCKernel(mesh.Nelements,
+                mesh.o_wJ,
                 mesh.o_ggeo,
                 mesh.o_sgeo,
                 mesh.o_D,
@@ -168,25 +180,23 @@ void elliptic_t::Run(){
 
   // gather rhs to globalDofs if c0
   if(settings.compareSetting("DISCRETIZATION","CONTINUOUS")){
-    ogsMasked->Gather(o_r, o_rL, ogs_dfloat, ogs_add, ogs_trans);
-    ogsMasked->Gather(o_x, o_xL, ogs_dfloat, ogs_add, ogs_notrans);
+    ogsMasked.Gather(o_r, o_rL, 1, ogs::Add, ogs::Trans);
+    ogsMasked.Gather(o_x, o_xL, 1, ogs::Add, ogs::NoTrans);
   }
 
   int maxIter = 5000;
   int verbose = settings.compareSetting("VERBOSE", "TRUE") ? 1 : 0;
 
-  MPI_Barrier(mesh.comm);
-  double startTime = MPI_Wtime();
+  timePoint_t start = GlobalPlatformTime(platform);
 
   //call the solver
-  dfloat tol = 1e-8;
-  int iter = Solve(*linearSolver, o_x, o_r, tol, maxIter, verbose);
-
+  dfloat tol = (sizeof(dfloat)==sizeof(double)) ? 1.0e-8 : 1.0e-5;
+  int iter = Solve(linearSolver, o_x, o_r, tol, maxIter, verbose);
 
   //add the boundary data to the masked nodes
   if(settings.compareSetting("DISCRETIZATION","CONTINUOUS")){
     // scatter x to LocalDofs if c0
-    ogsMasked->Scatter(o_xL, o_x, ogs_dfloat, ogs_add, ogs_notrans);
+    ogsMasked.Scatter(o_xL, o_x, 1, ogs::NoTrans);
     //fill masked nodes with BC data
     addBCKernel(mesh.Nelements,
                 mesh.o_x,
@@ -196,9 +206,8 @@ void elliptic_t::Run(){
                 o_xL);
   }
 
-  MPI_Barrier(mesh.comm);
-  double endTime = MPI_Wtime();
-  double elapsedTime = endTime - startTime;
+  timePoint_t end = GlobalPlatformTime(platform);
+  double elapsedTime = ElapsedTime(start, end);
 
   if ((mesh.rank==0) && verbose){
     printf("%d, " hlongFormat ", %g, %d, %g, %g; global: N, dofs, elapsed, iterations, time per node, nodes*iterations/time %s\n",
@@ -217,7 +226,7 @@ void elliptic_t::Run(){
     o_xL.copyTo(xL);
 
     // output field files
-    string name;
+    std::string name;
     settings.getSetting("OUTPUT FILE NAME", name);
     char fname[BUFSIZ];
     sprintf(fname, "%s_%04d.vtu", name.c_str(), mesh.rank);
@@ -231,15 +240,9 @@ void elliptic_t::Run(){
     mesh.MassMatrixApply(o_xL, o_MxL);
 
     dlong Nentries = mesh.Nelements*mesh.Np*Nfields;
-    dfloat norm2 = sqrt(linAlg.innerProd(Nentries, o_xL, o_MxL, mesh.comm));
+    dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_xL, o_MxL, mesh.comm));
 
     if(mesh.rank==0)
       printf("Solution norm = %17.15lg\n", norm2);
   }
-
-  free(rL); free(xL);
-  o_rL.free(); o_xL.free();
-  o_r.free(); o_x.free();
-  o_MxL.free();
-  delete linearSolver;
 }
diff --git a/solvers/elliptic/src/ellipticSettings.cpp b/solvers/elliptic/src/ellipticSettings.cpp
index 1078b5fc3..05080cc58 100644
--- a/solvers/elliptic/src/ellipticSettings.cpp
+++ b/solvers/elliptic/src/ellipticSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ SOFTWARE.
 #include "elliptic.hpp"
 
 //settings for elliptic solver
-ellipticSettings_t::ellipticSettings_t(const MPI_Comm& _comm):
+ellipticSettings_t::ellipticSettings_t(const comm_t& _comm):
   settings_t(_comm) {
 
   //common settings used when the elliptic solver
@@ -55,7 +55,7 @@ void ellipticAddRunSettings(settings_t& settings) {
 }
 
 void ellipticAddSettings(settings_t& settings,
-                         const string prefix) {
+                         const std::string prefix) {
   settings.newSetting(prefix+"DISCRETIZATION",
                       "CONTINUOUS",
                       "Type of Finite Element Discretization",
@@ -99,10 +99,7 @@ void ellipticAddSettings(settings_t& settings,
 
 void ellipticSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) {
     std::cout << "Elliptic Settings:\n\n";
     reportSetting("DATA FILE");
 
@@ -129,15 +126,15 @@ void ellipticSettings_t::report() {
 
 void ellipticSettings_t::parseFromFile(platformSettings_t& platformSettings,
                                        meshSettings_t& meshSettings,
-                                       const string filename) {
+                                       const std::string filename) {
   //read all settings from file
   settings_t s(comm);
   s.readSettingsFromFile(filename);
 
   for(auto it = s.settings.begin(); it != s.settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
-    const string val = set->getVal<string>();
+    setting_t& set = it->second;
+    const std::string name = set.getName();
+    const std::string val = set.getVal<std::string>();
     if (platformSettings.hasSetting(name))
       platformSettings.changeSetting(name, val);
     else if (meshSettings.hasSetting(name))
@@ -145,9 +142,7 @@ void ellipticSettings_t::parseFromFile(platformSettings_t& platformSettings,
     else if (hasSetting(name)) //self
       changeSetting(name, val);
     else  {
-      stringstream ss;
-      ss << "Unknown setting: [" << name << "] requested";
-      LIBP_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested");
     }
   }
 }
diff --git a/solvers/elliptic/src/ellipticSetup.cpp b/solvers/elliptic/src/ellipticSetup.cpp
index 6e8de66d9..105ad63ed 100644
--- a/solvers/elliptic/src/ellipticSetup.cpp
+++ b/solvers/elliptic/src/ellipticSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,157 +27,154 @@ SOFTWARE.
 #include "elliptic.hpp"
 #include "ellipticPrecon.hpp"
 
-elliptic_t& elliptic_t::Setup(platform_t& platform, mesh_t& mesh,
-                              ellipticSettings_t& settings, dfloat lambda,
-                              const int NBCTypes, const int *BCType){
+void elliptic_t::Setup(platform_t& _platform, mesh_t& _mesh,
+                       settings_t& _settings, dfloat _lambda,
+                       const int _NBCTypes, const memory<int> _BCType){
 
-  elliptic_t* elliptic = new elliptic_t(platform, mesh, settings, lambda);
+  platform = _platform;
+  mesh = _mesh;
+  comm = _mesh.comm;
+  settings = _settings;
+  lambda = _lambda;
 
-  elliptic->Nfields = 1;
+  Nfields = 1;
 
-  elliptic->disc_ipdg = settings.compareSetting("DISCRETIZATION","IPDG");
-  elliptic->disc_c0   = settings.compareSetting("DISCRETIZATION","CONTINUOUS");
+  //Trigger JIT kernel builds
+  ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add);
+
+  disc_ipdg = settings.compareSetting("DISCRETIZATION","IPDG");
+  disc_c0   = settings.compareSetting("DISCRETIZATION","CONTINUOUS");
 
   //setup linear algebra module
-  platform.linAlg.InitKernels({"add", "sum", "scale",
+  platform.linAlg().InitKernels({"add", "sum", "scale",
                                 "axpy", "zaxpy",
                                 "amx", "amxpy", "zamxpy",
                                 "adx", "adxpy", "zadxpy",
-                                "innerProd", "weightedInnerProd",
-                                "norm2", "weightedNorm2"});
+                                "innerProd", "norm2"});
 
   /*setup trace halo exchange */
-  elliptic->traceHalo = mesh.HaloTraceSetup(elliptic->Nfields);
+  traceHalo = mesh.HaloTraceSetup(Nfields);
 
   // Boundary Type translation. Just defaults.
-  elliptic->BCType = (int*) calloc(NBCTypes,sizeof(int));
-  memcpy(elliptic->BCType,BCType,NBCTypes*sizeof(int));
+  NBCTypes = _NBCTypes;
+  BCType.malloc(NBCTypes);
+  BCType.copyFrom(_BCType);
 
   //setup boundary flags and make mask and masked ogs
-  elliptic->BoundarySetup();
+  BoundarySetup();
 
   if (settings.compareSetting("DISCRETIZATION","IPDG")) {
     //tau (penalty term in IPDG)
-    if (mesh.elementType==TRIANGLES ||
-        mesh.elementType==QUADRILATERALS){
-      elliptic->tau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0;
+    if (mesh.elementType==Mesh::TRIANGLES ||
+        mesh.elementType==Mesh::QUADRILATERALS){
+      tau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0;
       if(mesh.dim==3)
-        elliptic->tau *= 1.5;
+        tau *= 1.5;
     } else
-      elliptic->tau = 2.0*(mesh.N+1)*(mesh.N+3);
+      tau = 2.0*(mesh.N+1)*(mesh.N+3);
 
     //buffer for gradient
     dlong Ntotal = mesh.Np*(mesh.Nelements+mesh.totalHaloPairs);
-    elliptic->grad = (dfloat*) calloc(Ntotal*4, sizeof(dfloat));
-    elliptic->o_grad  = platform.malloc(Ntotal*4*sizeof(dfloat), elliptic->grad);
+    grad.malloc(Ntotal*4);
+    o_grad = platform.malloc<dfloat>(grad);
   } else {
-    elliptic->tau = 0.0;
+    tau = 0.0;
 
     //buffer for local Ax
     dlong Ntotal = mesh.Np*mesh.Nelements;
-    elliptic->o_AqL  = platform.malloc(Ntotal*sizeof(dfloat));
+    o_AqL = platform.malloc<dfloat>(Ntotal);
   }
 
   // OCCA build stuff
-  occa::properties kernelInfo = mesh.props; //copy base occa properties
+  properties_t kernelInfo = mesh.props; //copy base occa properties
 
   // set kernel name suffix
-  char *suffix;
-  if(mesh.elementType==TRIANGLES){
+  std::string suffix;
+  if(mesh.elementType==Mesh::TRIANGLES){
     if(mesh.dim==2)
-      suffix = strdup("Tri2D");
+      suffix = "Tri2D";
     else
-      suffix = strdup("Tri3D");
-  } else if(mesh.elementType==QUADRILATERALS){
+      suffix = "Tri3D";
+  } else if(mesh.elementType==Mesh::QUADRILATERALS){
     if(mesh.dim==2)
-      suffix = strdup("Quad2D");
+      suffix = "Quad2D";
     else
-      suffix = strdup("Quad3D");
-  } else if(mesh.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  else if(mesh.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
+      suffix = "Quad3D";
+  } else if(mesh.elementType==Mesh::TETRAHEDRA)
+    suffix = "Tet3D";
+  else if(mesh.elementType==Mesh::HEXAHEDRA)
+    suffix = "Hex3D";
+
+  std::string oklFilePrefix = DELLIPTIC "/okl/";
+  std::string oklFileSuffix = ".okl";
 
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  std::string fileName, kernelName;
 
   //add standard boundary functions
-  char *boundaryHeaderFileName;
+  std::string boundaryHeaderFileName;
   if (mesh.dim==2)
-    boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary2D.h");
+    boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary2D.h");
   else if (mesh.dim==3)
-    boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary3D.h");
+    boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary3D.h");
   kernelInfo["includes"] += boundaryHeaderFileName;
 
   int blockMax = 256;
   if (platform.device.mode() == "CUDA") blockMax = 512;
 
-  int NblockV = mymax(1,blockMax/mesh.Np);
+  int NblockV = std::max(1,blockMax/mesh.Np);
   kernelInfo["defines/" "p_NblockV"]= NblockV;
 
   // Ax kernel
   if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) {
-    sprintf(fileName,  DELLIPTIC "/okl/ellipticAx%s.okl", suffix);
-    if(mesh.elementType==HEXAHEDRA){
+    fileName   = oklFilePrefix + "ellipticAx" + suffix + oklFileSuffix;
+    if(mesh.elementType==Mesh::HEXAHEDRA){
       if(mesh.settings.compareSetting("ELEMENT MAP", "TRILINEAR"))
-        sprintf(kernelName, "ellipticPartialAxTrilinear%s", suffix);
+        kernelName = "ellipticPartialAxTrilinear" + suffix;
       else
-        sprintf(kernelName, "ellipticPartialAx%s", suffix);
+        kernelName = "ellipticPartialAx" + suffix;
     } else{
-      sprintf(kernelName, "ellipticPartialAx%s", suffix);
+      kernelName = "ellipticPartialAx" + suffix;
     }
 
-    elliptic->partialAxKernel = platform.buildKernel(fileName, kernelName,
-                                     kernelInfo);
+    partialAxKernel = platform.buildKernel(fileName, kernelName,
+                                           kernelInfo);
 
   } else if (settings.compareSetting("DISCRETIZATION","IPDG")) {
-    int Nmax = mymax(mesh.Np, mesh.Nfaces*mesh.Nfp);
+    int Nmax = std::max(mesh.Np, mesh.Nfaces*mesh.Nfp);
     kernelInfo["defines/" "p_Nmax"]= Nmax;
 
-    sprintf(fileName, DELLIPTIC "/okl/ellipticGradient%s.okl", suffix);
-    sprintf(kernelName, "ellipticPartialGradient%s", suffix);
-    elliptic->partialGradientKernel = platform.buildKernel(fileName, kernelName,
+    fileName   = oklFilePrefix + "ellipticGradient" + suffix + oklFileSuffix;
+    kernelName = "ellipticPartialGradient" + suffix;
+    partialGradientKernel = platform.buildKernel(fileName, kernelName,
                                                   kernelInfo);
 
-    sprintf(fileName, DELLIPTIC "/okl/ellipticAxIpdg%s.okl", suffix);
-    sprintf(kernelName, "ellipticPartialAxIpdg%s", suffix);
-    elliptic->partialIpdgKernel = platform.buildKernel(fileName, kernelName,
+    fileName   = oklFilePrefix + "ellipticAxIpdg" + suffix + oklFileSuffix;
+    kernelName = "ellipticPartialAxIpdg" + suffix;
+    partialIpdgKernel = platform.buildKernel(fileName, kernelName,
                                               kernelInfo);
   }
 
   /* Preconditioner Setup */
   if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) {
-    elliptic->Ndofs = elliptic->ogsMasked->Ngather*elliptic->Nfields;
-    elliptic->Nhalo = elliptic->ogsMasked->NgatherHalo*elliptic->Nfields;
+    Ndofs = ogsMasked.Ngather*Nfields;
+    Nhalo = gHalo.Nhalo*Nfields;
   } else {
-    elliptic->Ndofs = mesh.Nelements*mesh.Np*elliptic->Nfields;
-    elliptic->Nhalo = mesh.totalHaloPairs*mesh.Np*elliptic->Nfields;
+    Ndofs = mesh.Nelements*mesh.Np*Nfields;
+    Nhalo = mesh.totalHaloPairs*mesh.Np*Nfields;
   }
 
   if       (settings.compareSetting("PRECONDITIONER", "JACOBI"))
-    elliptic->precon = new JacobiPrecon(*elliptic);
+    precon.Setup<JacobiPrecon>(*this);
   else if(settings.compareSetting("PRECONDITIONER", "MASSMATRIX"))
-    elliptic->precon = new MassMatrixPrecon(*elliptic);
+    precon.Setup<MassMatrixPrecon>(*this);
   else if(settings.compareSetting("PRECONDITIONER", "PARALMOND"))
-    elliptic->precon = new ParAlmondPrecon(*elliptic);
+    precon.Setup<ParAlmondPrecon>(*this);
   else if(settings.compareSetting("PRECONDITIONER", "MULTIGRID"))
-    elliptic->precon = new MultiGridPrecon(*elliptic);
+    precon.Setup<MultiGridPrecon>(*this);
   else if(settings.compareSetting("PRECONDITIONER", "SEMFEM"))
-    elliptic->precon = new SEMFEMPrecon(*elliptic);
+    precon.Setup<SEMFEMPrecon>(*this);
   else if(settings.compareSetting("PRECONDITIONER", "OAS"))
-    elliptic->precon = new OASPrecon(*elliptic);
+    precon.Setup<OASPrecon>(*this);
   else if(settings.compareSetting("PRECONDITIONER", "NONE"))
-    elliptic->precon = new IdentityPrecon(elliptic->Ndofs);
-
-  return *elliptic;
-}
-
-elliptic_t::~elliptic_t() {
-  maskKernel.free();
-  partialAxKernel.free();
-  partialGradientKernel.free();
-  partialIpdgKernel.free();
-
-  if (traceHalo) traceHalo->Free();
-  if (ogsMasked) ogsMasked->Free();
-  if (precon) delete precon;
+    precon.Setup<IdentityPrecon>(Ndofs);
 }
diff --git a/solvers/elliptic/src/ellipticSetupNewDegree.cpp b/solvers/elliptic/src/ellipticSetupNewDegree.cpp
index 77c941f1a..163aa42a6 100644
--- a/solvers/elliptic/src/ellipticSetupNewDegree.cpp
+++ b/solvers/elliptic/src/ellipticSetupNewDegree.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,44 +27,31 @@ SOFTWARE.
 #include "elliptic.hpp"
 #include "ellipticPrecon.hpp"
 
-elliptic_t& elliptic_t::SetupNewDegree(mesh_t& meshC){
+elliptic_t elliptic_t::SetupNewDegree(mesh_t& meshC){ // result is now returned by value, not as a reference
 
   //if asking for the same degree, return the original solver
   if (meshC.N == mesh.N) return *this;
 
-  elliptic_t* elliptic = new elliptic_t(platform, meshC, settings, lambda);
-
   //shallow copy
-  elliptic->Nfields = Nfields;
-  elliptic->lambda = lambda;
-
-  elliptic->disc_ipdg = disc_ipdg;
-  elliptic->disc_c0 = disc_c0;
-
-  elliptic->grad = grad;
-  elliptic->o_grad = o_grad;
+  elliptic_t elliptic = *this; // copy this solver wholesale; members tied to the mesh are replaced below
 
-  elliptic->o_AqL = o_AqL;
-
-  elliptic->BCType = BCType;
-
-  elliptic->maskKernel = maskKernel;
+  elliptic.mesh = meshC; // the copy now works on the new-degree mesh
 
   /*setup trace halo exchange */
-  elliptic->traceHalo = meshC.HaloTraceSetup(elliptic->Nfields);
+  elliptic.traceHalo = meshC.HaloTraceSetup(Nfields);
 
   //setup boundary flags and make mask and masked ogs
-  elliptic->BoundarySetup();
+  elliptic.BoundarySetup();
 
   //tau (penalty term in IPDG)
   if (settings.compareSetting("DISCRETIZATION","IPDG")) {
-    if (meshC.elementType==TRIANGLES ||
-        meshC.elementType==QUADRILATERALS){
-      elliptic->tau = 2.0*(meshC.N+1)*(meshC.N+2)/2.0;
+    if (meshC.elementType==Mesh::TRIANGLES ||
+        meshC.elementType==Mesh::QUADRILATERALS){
+      elliptic.tau = 2.0*(meshC.N+1)*(meshC.N+2)/2.0;
       if(meshC.dim==3)
-        elliptic->tau *= 1.5;
+        elliptic.tau *= 1.5;
     } else
-      elliptic->tau = 2.0*(meshC.N+1)*(meshC.N+3);
+      elliptic.tau = 2.0*(meshC.N+1)*(meshC.N+3);
 
     //buffer for gradient (Reuse the original buffer)
     // dlong Ntotal = meshC.Np*(meshC.Nelements+meshC.totalHaloPairs);
@@ -73,78 +60,83 @@ elliptic_t& elliptic_t::SetupNewDegree(mesh_t& meshC){
   }
 
   // OCCA build stuff
-  occa::properties kernelInfo = meshC.props; //copy base occa properties
+  properties_t kernelInfo = meshC.props; //copy base occa properties
 
   // set kernel name suffix
-  char *suffix;
-  if(meshC.elementType==TRIANGLES){
+  std::string suffix; // stays empty if none of the element types below matches
+  if(meshC.elementType==Mesh::TRIANGLES){
     if(meshC.dim==2)
-      suffix = strdup("Tri2D");
+      suffix = "Tri2D";
     else
-      suffix = strdup("Tri3D");
-  } else if(meshC.elementType==QUADRILATERALS){
+      suffix = "Tri3D";
+  } else if(meshC.elementType==Mesh::QUADRILATERALS){
     if(meshC.dim==2)
-      suffix = strdup("Quad2D");
+      suffix = "Quad2D";
     else
-      suffix = strdup("Quad3D");
-  } else if(meshC.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  else if(meshC.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
+      suffix = "Quad3D";
+  } else if(meshC.elementType==Mesh::TETRAHEDRA)
+    suffix = "Tet3D";
+  else if(meshC.elementType==Mesh::HEXAHEDRA)
+    suffix = "Hex3D";
+
+  std::string oklFilePrefix = DELLIPTIC "/okl/"; // shared prefix for every kernel file path built below
+  std::string oklFileSuffix = ".okl";
 
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  std::string fileName, kernelName; // reassigned before each buildKernel call
 
   //add standard boundary functions
-  char *boundaryHeaderFileName;
+  std::string boundaryHeaderFileName; // left empty when meshC.dim is neither 2 nor 3
   if (meshC.dim==2)
-    boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary2D.h");
+    boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary2D.h");
   else if (meshC.dim==3)
-    boundaryHeaderFileName = strdup(DELLIPTIC "/data/ellipticBoundary3D.h");
+    boundaryHeaderFileName = std::string(DELLIPTIC "/data/ellipticBoundary3D.h");
   kernelInfo["includes"] += boundaryHeaderFileName;
 
   int blockMax = 256;
   if (platform.device.mode() == "CUDA") blockMax = 512;
 
-  int NblockV = mymax(1,blockMax/meshC.Np);
+  int NblockV = std::max(1,blockMax/meshC.Np); // never below 1, even when Np exceeds blockMax
   kernelInfo["defines/" "p_NblockV"]= NblockV;
 
   // Ax kernel
   if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) {
-    sprintf(fileName,  DELLIPTIC "/okl/ellipticAx%s.okl", suffix);
-    if(meshC.elementType==HEXAHEDRA){
+    fileName   = oklFilePrefix + "ellipticAx" + suffix + oklFileSuffix;
+    if(meshC.elementType==Mesh::HEXAHEDRA){
       if(mesh.settings.compareSetting("ELEMENT MAP", "TRILINEAR"))
-        sprintf(kernelName, "ellipticPartialAxTrilinear%s", suffix);
+        kernelName = "ellipticPartialAxTrilinear" + suffix;
       else
-        sprintf(kernelName, "ellipticPartialAx%s", suffix);
+        kernelName = "ellipticPartialAx" + suffix;
     } else{
-      sprintf(kernelName, "ellipticPartialAx%s", suffix);
+      kernelName = "ellipticPartialAx" + suffix;
     }
 
-    elliptic->partialAxKernel = platform.buildKernel(fileName, kernelName,
+    elliptic.partialAxKernel = platform.buildKernel(fileName, kernelName,
                                             kernelInfo);
 
   } else if (settings.compareSetting("DISCRETIZATION","IPDG")) {
-    int Nmax = mymax(meshC.Np, meshC.Nfaces*meshC.Nfp);
+    int Nmax = std::max(meshC.Np, meshC.Nfaces*meshC.Nfp);
     kernelInfo["defines/" "p_Nmax"]= Nmax;
 
-    sprintf(fileName, DELLIPTIC "/okl/ellipticGradient%s.okl", suffix);
-    sprintf(kernelName, "ellipticPartialGradient%s", suffix);
-    elliptic->partialGradientKernel = platform.buildKernel(fileName, kernelName,
+    fileName   = oklFilePrefix + "ellipticGradient" + suffix + oklFileSuffix;
+    kernelName = "ellipticPartialGradient" + suffix;
+    elliptic.partialGradientKernel = platform.buildKernel(fileName, kernelName,
                                                   kernelInfo);
 
-    sprintf(fileName, DELLIPTIC "/okl/ellipticAxIpdg%s.okl", suffix);
-    sprintf(kernelName, "ellipticPartialAxIpdg%s", suffix);
-    elliptic->partialIpdgKernel = platform.buildKernel(fileName, kernelName,
+    fileName   = oklFilePrefix + "ellipticAxIpdg" + suffix + oklFileSuffix;
+    kernelName = "ellipticPartialAxIpdg" + suffix;
+    elliptic.partialIpdgKernel = platform.buildKernel(fileName, kernelName,
                                               kernelInfo);
   }
 
   if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) {
-    elliptic->Ndofs = elliptic->ogsMasked->Ngather*elliptic->Nfields;
+    elliptic.Ndofs = elliptic.ogsMasked.Ngather*Nfields;
+    elliptic.Nhalo = elliptic.gHalo.Nhalo*Nfields; // Nhalo is new in this patch; the old code set only Ndofs
   } else {
-    elliptic->Ndofs = meshC.Nelements*meshC.Np*elliptic->Nfields;
+    elliptic.Ndofs = meshC.Nelements*meshC.Np*Nfields;
+    elliptic.Nhalo = meshC.totalHaloPairs*meshC.Np*Nfields; // one Np-sized, Nfields-wide entry per halo pair
   }
 
-  elliptic->precon = NULL;
+  elliptic.precon = precon_t(); // replace the copied preconditioner with a default-constructed one (old code: NULL)
 
-  return *elliptic;
+  return elliptic; // handed back by value
 }
diff --git a/solvers/elliptic/src/ellipticSetupRingPatch.cpp b/solvers/elliptic/src/ellipticSetupRingPatch.cpp
index 92f8b0281..7a3a4bd7b 100644
--- a/solvers/elliptic/src/ellipticSetupRingPatch.cpp
+++ b/solvers/elliptic/src/ellipticSetupRingPatch.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,59 +27,41 @@ SOFTWARE.
 #include "elliptic.hpp"
 #include "ellipticPrecon.hpp"
 
-elliptic_t* elliptic_t::SetupRingPatch(mesh_t& meshPatch){
+elliptic_t elliptic_t::SetupRingPatch(mesh_t& meshPatch){ // patch solver is now returned by value (was a pointer)
 
   //just reuse the current solver if there are no neighbors
-  if (mesh.size == 1) return this;
-
-  elliptic_t* elliptic = new elliptic_t(platform, meshPatch, settings, lambda);
+  if (mesh.size == 1) return *this; // by-value return, so the caller receives a copy of this solver
 
   //shallow copy
-  elliptic->Nfields = Nfields;
-  elliptic->lambda = lambda;
+  elliptic_t elliptic = *this; // presumably also carries over tau, BCType, maskKernel and the Ax kernels the old code assigned one by one - confirm
 
-  elliptic->disc_ipdg = disc_ipdg;
-  elliptic->disc_c0 = disc_c0;
+  elliptic.mesh = meshPatch; // the copy now works on the patch mesh
+  elliptic.comm = meshPatch.comm; // and on the patch mesh's communicator
 
   //buffer for gradient
   if (settings.compareSetting("DISCRETIZATION","IPDG")) {
     dlong Ntotal = meshPatch.Np*meshPatch.Nelements;
-    elliptic->grad = (dfloat*) calloc(Ntotal*4, sizeof(dfloat));
-    elliptic->o_grad  = platform.malloc(Ntotal*4*sizeof(dfloat), elliptic->grad);
+    elliptic.grad.malloc(Ntotal*4, 0.0); // presumably Ntotal*4 entries filled with 0.0, mirroring the calloc it replaces - confirm
+    elliptic.o_grad = platform.malloc<dfloat>(elliptic.grad); // device buffer built from the host grad array
   } else {
     //buffer for local Ax
     dlong Ntotal = meshPatch.Np*meshPatch.Nelements;
-    elliptic->o_AqL  = platform.malloc(Ntotal*sizeof(dfloat));
+    elliptic.o_AqL = platform.malloc<dfloat>(Ntotal); // presumably an element count now rather than a byte count - confirm
   }
-  //tau (penalty term in IPDG)
-  elliptic->tau = tau;
 
   /*setup trace halo exchange */
-  elliptic->traceHalo = meshPatch.HaloTraceSetup(elliptic->Nfields);
-
-  elliptic->BCType = BCType;
-
-  elliptic->maskKernel = maskKernel;
+  elliptic.traceHalo = meshPatch.HaloTraceSetup(Nfields);
 
   //setup boundary flags and make mask and masked ogs
-  elliptic->BoundarySetup();
-
-
-  // Ax kernel
-  if (settings.compareSetting("DISCRETIZATION","CONTINUOUS")) {
-    elliptic->partialAxKernel = partialAxKernel;
-  } else if (settings.compareSetting("DISCRETIZATION","IPDG")) {
-    elliptic->partialGradientKernel = partialGradientKernel;
-    elliptic->partialIpdgKernel = partialIpdgKernel;
-  }
+  elliptic.BoundarySetup();
 
   if (settings.compareSetting("DISCRETIZATION", "CONTINUOUS")) {
-    elliptic->Ndofs = elliptic->ogsMasked->Ngather*elliptic->Nfields;
+    elliptic.Ndofs = elliptic.ogsMasked.Ngather*Nfields;
   } else {
-    elliptic->Ndofs = meshPatch.Nelements*meshPatch.Np*elliptic->Nfields;
+    elliptic.Ndofs = meshPatch.Nelements*meshPatch.Np*Nfields; // NOTE(review): Nhalo is not reassigned in this function (SetupNewDegree does set it), so it keeps the value copied from *this - confirm that is intended
   }
 
-  elliptic->precon = NULL;
+  elliptic.precon = precon_t(); // replace the copied preconditioner with a default-constructed one (old code: NULL)
 
   return elliptic;
 }
diff --git a/solvers/elliptic/src/ellipticSolve.cpp b/solvers/elliptic/src/ellipticSolve.cpp
index e6e385edb..be75efa3a 100644
--- a/solvers/elliptic/src/ellipticSolve.cpp
+++ b/solvers/elliptic/src/ellipticSolve.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,13 +27,13 @@ SOFTWARE.
 #include "elliptic.hpp"
 
 int elliptic_t::Solve(linearSolver_t& linearSolver,
-                      occa::memory &o_x, occa::memory &o_r,
+                      deviceMemory<dfloat> &o_x, deviceMemory<dfloat> &o_r, // typed device buffers replace raw occa::memory
                       const dfloat tol, const int MAXIT, const int verbose){
 
   // if there is a nullspace, remove the constant vector from r
   if(allNeumann) ZeroMean(o_r);
 
-  int Niter = linearSolver.Solve(*this, *precon, o_x, o_r, tol, MAXIT, verbose);
+  int Niter = linearSolver.Solve(*this, precon, o_x, o_r, tol, MAXIT, verbose); // precon is passed as an object now, no pointer dereference
 
   return Niter;
 }
diff --git a/solvers/elliptic/src/ellipticZeroMean.cpp b/solvers/elliptic/src/ellipticZeroMean.cpp
index 1a5560486..3233bd566 100644
--- a/solvers/elliptic/src/ellipticZeroMean.cpp
+++ b/solvers/elliptic/src/ellipticZeroMean.cpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -26,13 +26,12 @@
 
 #include "elliptic.hpp"
 
-void elliptic_t::ZeroMean(occa::memory &o_q){
+void elliptic_t::ZeroMean(deviceMemory<dfloat> &o_q){ // o_q is modified in place: qmean is subtracted from its first Ndofs entries
 
-  dfloat qmean = linAlg.sum(Ndofs, o_q, mesh.comm);
+  dfloat qmean = platform.linAlg().sum(Ndofs, o_q, mesh.comm); // presumably the sum of o_q reduced over mesh.comm - confirm against linAlg
 
   // normalize
   qmean *= allNeumannScale*allNeumannScale;
-
   // q[n] = q[n] - qmean
-  platform.linAlg.add(Ndofs, -qmean, o_q);
+  platform.linAlg().add(Ndofs, -qmean, o_q); // linAlg is reached through an accessor call on platform now
 }
diff --git a/solvers/fokkerPlanck/data/fpeLinear2D.h b/solvers/fokkerPlanck/data/fpeLinear2D.h
index 014feb0fa..b73fcefbc 100644
--- a/solvers/fokkerPlanck/data/fpeLinear2D.h
+++ b/solvers/fokkerPlanck/data/fpeLinear2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -66,4 +66,4 @@ SOFTWARE.
     *(qxB) = 0.0;                      \
     *(qyB) = 0.0;                      \
   }                                    \
-}
\ No newline at end of file
+}
diff --git a/solvers/fokkerPlanck/data/fpeLinear3D.h b/solvers/fokkerPlanck/data/fpeLinear3D.h
index 4d36c316a..a00f02095 100644
--- a/solvers/fokkerPlanck/data/fpeLinear3D.h
+++ b/solvers/fokkerPlanck/data/fpeLinear3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -72,4 +72,4 @@ SOFTWARE.
     *(qyB) = 0.0;                      \
     *(qzB) = 0.0;                      \
   }                                    \
-}
\ No newline at end of file
+}
diff --git a/solvers/fokkerPlanck/fpe.hpp b/solvers/fokkerPlanck/fpe.hpp
index 912730587..c0e360fa9 100644
--- a/solvers/fokkerPlanck/fpe.hpp
+++ b/solvers/fokkerPlanck/fpe.hpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -37,48 +37,47 @@
 
 #define DFPE LIBP_DIR"/solvers/fokkerPlanck/"
 
+using namespace libp;
+
 class fpeSettings_t: public settings_t {
 public:
-  fpeSettings_t(MPI_Comm& _comm);
+  fpeSettings_t(comm_t& _comm); // takes the library's comm_t wrapper instead of a raw MPI_Comm
   void report();
   void parseFromFile(platformSettings_t& platformSettings,
                      meshSettings_t& meshSettings,
-                     const string filename);
+                     const std::string filename); // std:: is now spelled out explicitly
 
-  ellipticSettings_t* extractEllipticSettings();
+  ellipticSettings_t extractEllipticSettings(); // returns the settings object by value (was a pointer)
 };
 
 class fpe_t;
 
 class subcycler_t: public solver_t {
 public:
-  mesh_t& mesh;
+  mesh_t mesh; // held by value now (was a reference)
 
   int cubature;
-  halo_t* traceHalo;
-  occa::kernel advectionVolumeKernel;
-  occa::kernel advectionSurfaceKernel;
-
-  subcycler_t() = delete;
-  subcycler_t(fpe_t& fpe);
+  ogs::halo_t traceHalo; // held by value now (was a pointer)
+  kernel_t advectionVolumeKernel;
+  kernel_t advectionSurfaceKernel;
 
-  ~subcycler_t(){};
+  subcycler_t() = default; // default-constructible; the fpe_t-taking constructor and the explicit destructor are removed
 
   void Report(dfloat time, int tstep){};
 
-  void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time);
+  void rhsf(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time);
 };
 
 class fpe_t: public solver_t {
 public:
-  mesh_t& mesh;
-  TimeStepper::timeStepper_t* timeStepper;
+  mesh_t mesh; // held by value now (was a reference)
+  timeStepper_t timeStepper; // held by value now (was a pointer)
 
-  halo_t* traceHalo;
+  ogs::halo_t traceHalo;
 
-  ellipticSettings_t *ellipticSettings;
-  elliptic_t *elliptic;
-  linearSolver_t *linearSolver;
+  ellipticSettings_t ellipticSettings; // the three elliptic-solve members are objects now, not owning raw pointers
+  elliptic_t elliptic;
+  linearSolver_t linearSolver;
 
   int Nfields;
 
@@ -87,60 +86,59 @@ class fpe_t: public solver_t {
   dfloat mu;
   dfloat tau;
 
-  dfloat *q;
-  occa::memory o_q;
+  memory<dfloat> q; // host-side array type replacing the raw dfloat* (o_q is its device counterpart by naming - confirm)
+  deviceMemory<dfloat> o_q;
 
-  occa::memory o_Mq;
+  deviceMemory<dfloat> o_Mq;
 
-  dfloat *grad;
-  occa::memory o_grad;
+  memory<dfloat> grad;
+  deviceMemory<dfloat> o_grad;
 
   //subcycling
   int Nsubcycles;
-  TimeStepper::timeStepper_t* subStepper;
-  subcycler_t *subcycler;
+  timeStepper_t subStepper;
+  subcycler_t subcycler; // relies on subcycler_t being default-constructible
 
-  occa::kernel advectionVolumeKernel;
-  occa::kernel advectionSurfaceKernel;
-  occa::kernel gradientKernel;
-  occa::kernel diffusionKernel;
-  occa::kernel diffusionRhsKernel;
+  kernel_t advectionVolumeKernel;
+  kernel_t advectionSurfaceKernel;
+  kernel_t gradientKernel;
+  kernel_t diffusionKernel;
+  kernel_t diffusionRhsKernel;
 
-  occa::kernel initialConditionKernel;
-  occa::kernel maxWaveSpeedKernel;
+  kernel_t initialConditionKernel;
+  kernel_t maxWaveSpeedKernel;
 
-  fpe_t() = delete;
+  fpe_t() = default; // default construction is allowed now (was deleted)
   fpe_t(platform_t &_platform, mesh_t &_mesh,
-        settings_t& _settings):
-    solver_t(_platform, _settings), mesh(_mesh) {}
-
-  ~fpe_t();
+        fpeSettings_t& _settings) { // now requires fpeSettings_t rather than the settings_t base
+    Setup(_platform, _mesh, _settings); // all initialisation is delegated to Setup
+  }
 
   //setup
-  static fpe_t& Setup(platform_t& platform, mesh_t& mesh,
-                      fpeSettings_t& settings);
+  void Setup(platform_t& _platform, mesh_t& _mesh, // non-static member now; the static factory returning fpe_t& is gone
+             fpeSettings_t& _settings);
 
   void Run();
 
   void Report(dfloat time, int tstep);
 
-  void PlotFields(dfloat* Q, char *fileName);
+  void PlotFields(memory<dfloat>& Q, std::string fileName);
 
-  dfloat MaxWaveSpeed(occa::memory& o_Q, const dfloat T);
+  dfloat MaxWaveSpeed(deviceMemory<dfloat>& o_Q, const dfloat T);
 
-  void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time);
+  void rhsf(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time);
 
-  void rhs_imex_f(occa::memory& o_q, occa::memory& o_rhs, const dfloat time);
-  void rhs_imex_g(occa::memory& o_q, occa::memory& o_rhs, const dfloat time);
+  void rhs_imex_f(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time);
+  void rhs_imex_g(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time);
 
-  void rhs_imex_invg(occa::memory& o_q, occa::memory& o_rhs, const dfloat gamma, const dfloat time);
+  void rhs_imex_invg(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat gamma, const dfloat time);
 
-  void rhs_subcycle_f(occa::memory& o_Q, occa::memory& o_QHAT,
-                           const dfloat T, const dfloat dt, const dfloat* B,
-                           const int order, const int shiftIndex, const int maxOrder);
+  void rhs_subcycle_f(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_QHAT,
+                      const dfloat T, const dfloat dt, const memory<dfloat> B, // B is taken by value here; NOTE(review): confirm copying memory<dfloat> is cheap
+                      const int order, const int shiftIndex, const int maxOrder);
 
-  void Advection(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T);
-  void Diffusion(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T);
+  void Advection(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T);
+  void Diffusion(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T);
 };
 
 #endif
diff --git a/solvers/fokkerPlanck/fpeMain.cpp b/solvers/fokkerPlanck/fpeMain.cpp
index abf5f9d83..3eaa8a6e0 100644
--- a/solvers/fokkerPlanck/fpeMain.cpp
+++ b/solvers/fokkerPlanck/fpeMain.cpp
@@ -1,7 +1,7 @@
 /*
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,40 +28,41 @@ SOFTWARE.
 int main(int argc, char **argv){
 
   // start up MPI
-  MPI_Init(&argc, &argv);
+  Comm::Init(argc, argv); // library wrapper replacing the direct MPI_Init call
 
-  MPI_Comm comm = MPI_COMM_WORLD;
+  LIBP_ABORT("Usage: ./fpeMain setupfile", argc!=2); // condition is now an argument; replaces the old if(argc!=2) guard
 
-  if(argc!=2)
-    LIBP_ABORT(string("Usage: ./fpeMain setupfile"));
+  { /*Scope so everything is destructed before MPI_Finalize */
+    comm_t comm(Comm::World().Dup()); // presumably a duplicate of the world communicator - confirm against comm_t
 
-  //create default settings
-  platformSettings_t platformSettings(comm);
-  meshSettings_t meshSettings(comm);
-  fpeSettings_t fpeSettings(comm);
+    //create default settings
+    platformSettings_t platformSettings(comm);
+    meshSettings_t meshSettings(comm);
+    fpeSettings_t fpeSettings(comm);
 
-  //load settings from file
-  fpeSettings.parseFromFile(platformSettings, meshSettings,
-                            argv[1]);
+    //load settings from file
+    fpeSettings.parseFromFile(platformSettings, meshSettings,
+                              argv[1]);
 
-  // set up platform
-  platform_t platform(platformSettings);
+    // set up platform
+    platform_t platform(platformSettings);
 
-  platformSettings.report();
-  meshSettings.report();
-  fpeSettings.report();
+    platformSettings.report();
+    meshSettings.report();
+    fpeSettings.report();
 
-  // set up mesh
-  mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm);
+    // set up mesh
+    mesh_t mesh(platform, meshSettings, comm); // constructed directly as a local object; the static Setup factory is no longer used
 
-  // set up fpe solver
-  fpe_t& fpe = fpe_t::Setup(platform, mesh, fpeSettings);
+    // set up fpe solver
+    fpe_t fpe(platform, mesh, fpeSettings); // this constructor calls fpe_t::Setup internally
 
-  // run
-  fpe.Run();
+    // run
+    fpe.Run();
+  } // fpe, mesh, platform, the settings objects and comm all go out of scope here, before Comm::Finalize
 
   // close down MPI
-  MPI_Finalize();
+  Comm::Finalize();
   return LIBP_SUCCESS;
 }
 
diff --git a/solvers/fokkerPlanck/makefile b/solvers/fokkerPlanck/makefile
index 1a11fa09a..340800e29 100644
--- a/solvers/fokkerPlanck/makefile
+++ b/solvers/fokkerPlanck/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -78,29 +78,25 @@ include ../../make.top
 endif
 endif
 
-#gslib
-GS_DIR=${LIBP_TPL_DIR}/gslib
-
 #libraries
 ELLIPTIC_DIR =${LIBP_DIR}/solvers/elliptic
-FPE_LIBP_LIBS=timeStepper linearSolver parAlmond mesh ogs linAlg core
+FPE_LIBP_LIBS=timeStepper linearSolver parAlmond mesh parAdogs ogs linAlg core
 
 #includes
 INCLUDES=-I${ELLIPTIC_DIR} \
-					${LIBP_INCLUDES} \
-				 -I.
+			${LIBP_INCLUDES} \
+			-I.
 
 #defines
 DEFINES =${LIBP_DEFINES} \
          -DLIBP_DIR='"${LIBP_DIR}"'
 
 #.cpp compilation flags
-FPE_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES}
+FPE_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES}
 
 #link libraries
 LIBS=-L${ELLIPTIC_DIR} -lelliptic \
-		 -L${LIBP_LIBS_DIR} $(addprefix -l,$(FPE_LIBP_LIBS)) \
-     -L$(GS_DIR)/lib -lgs \
+	  -L${LIBP_LIBS_DIR} $(addprefix -l,$(FPE_LIBP_LIBS)) \
      ${LIBP_LIBS}
 
 #link flags
@@ -156,10 +152,10 @@ endif
 # rule for .cpp files
 %.o: %.cpp $(DEPS) | libelliptic
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $*.o -c $*.cpp $(FPE_CXXFLAGS)
+	$(LIBP_CXX) -o $*.o -c $*.cpp $(FPE_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $*.o -c $*.cpp $(FPE_CXXFLAGS)
+	@$(LIBP_CXX) -o $*.o -c $*.cpp $(FPE_CXXFLAGS)
 endif
 
 #cleanup
@@ -171,8 +167,7 @@ clean-libs: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} clean
 
 clean-kernels: clean-libs
-# 	$(shell ${OCCA_DIR}/bin/occa clear all -y)
-	rm -rf ~/.occa/
+	rm -rf ${LIBP_DIR}/.occa/
 
 realclean: clean
 	${MAKE} -C ${ELLIPTIC_DIR} clean
diff --git a/solvers/fokkerPlanck/okl/fpeAdvectionHex3D.okl b/solvers/fokkerPlanck/okl/fpeAdvectionHex3D.okl
index ee60ada81..671c6a0b1 100644
--- a/solvers/fokkerPlanck/okl/fpeAdvectionHex3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeAdvectionHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -58,8 +58,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     #pragma unroll p_Nq
     for(int k=0;k<p_Nq;++k){
 
@@ -90,8 +88,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
-
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
           #pragma unroll p_Nq
@@ -105,8 +101,7 @@ SOFTWARE.
           }
         }
       }
-      @barrier("local");
-    } //k loop
+    }
 
     //write out
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -205,8 +200,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -217,8 +210,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -229,8 +220,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -249,8 +238,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
       for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -261,8 +248,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -281,8 +266,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
         #pragma unroll p_Nq
diff --git a/solvers/fokkerPlanck/okl/fpeAdvectionQuad2D.okl b/solvers/fokkerPlanck/okl/fpeAdvectionQuad2D.okl
index 2eec6dc7e..41cc3148a 100644
--- a/solvers/fokkerPlanck/okl/fpeAdvectionQuad2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeAdvectionQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -71,7 +71,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -181,7 +180,6 @@ void surfaceTerms(dlong e, int es,
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -203,7 +201,6 @@ void surfaceTerms(dlong e, int es,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -224,7 +221,6 @@ void surfaceTerms(dlong e, int es,
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/fokkerPlanck/okl/fpeAdvectionTet3D.okl b/solvers/fokkerPlanck/okl/fpeAdvectionTet3D.okl
index eb8af7493..884e46f91 100644
--- a/solvers/fokkerPlanck/okl/fpeAdvectionTet3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeAdvectionTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -72,7 +72,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -180,7 +179,6 @@ SOFTWARE.
 
 
     // wait for all flux functions are written to @shared
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/fokkerPlanck/okl/fpeAdvectionTri2D.okl b/solvers/fokkerPlanck/okl/fpeAdvectionTri2D.okl
index 7071b2950..cbe4b6a1b 100644
--- a/solvers/fokkerPlanck/okl/fpeAdvectionTri2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeAdvectionTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -64,7 +64,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -172,7 +171,6 @@ SOFTWARE.
     }
 
     // wait for all flux functions are written to @shared
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/fokkerPlanck/okl/fpeCubatureAdvectionHex3D.okl b/solvers/fokkerPlanck/okl/fpeCubatureAdvectionHex3D.okl
index 82ea4c0e7..f3dcb8cbf 100644
--- a/solvers/fokkerPlanck/okl/fpeCubatureAdvectionHex3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeCubatureAdvectionHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -74,7 +74,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //read in and interpolate in k
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -95,7 +94,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_cubNq
     for(int k=0;k<p_cubNq;++k){
@@ -108,7 +106,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -123,7 +120,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -133,7 +129,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -146,7 +141,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     //#pragma unroll p_cubNq
@@ -179,7 +173,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -194,7 +187,6 @@ SOFTWARE.
           }
         }
       }
-      @barrier("local");
     } //k loop
 
     #pragma unroll p_cubNq
@@ -206,7 +198,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -221,7 +212,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -231,7 +221,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -246,7 +235,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     //project in k and write out
@@ -293,7 +281,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -308,7 +295,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -319,7 +305,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -332,7 +317,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -373,7 +357,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -387,7 +370,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -396,7 +378,6 @@ SOFTWARE.
         }                                                               \
       }                                                                 \
     }                                                                   \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -410,7 +391,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -474,11 +454,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     surfaceTerms(0) //face 0
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -489,11 +467,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     surfaceTerms(5) //face 5
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -504,11 +480,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     surfaceTerms(1) //face 1
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -522,11 +496,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     surfaceTerms(3) //face 3
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -540,11 +512,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     surfaceTerms(2) //face 2
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -558,11 +528,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     surfaceTerms(4) //face 4
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -576,7 +544,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
diff --git a/solvers/fokkerPlanck/okl/fpeCubatureAdvectionQuad2D.okl b/solvers/fokkerPlanck/okl/fpeCubatureAdvectionQuad2D.okl
index c23ca1773..318419f04 100644
--- a/solvers/fokkerPlanck/okl/fpeCubatureAdvectionQuad2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeCubatureAdvectionQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -73,7 +73,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -92,7 +91,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -105,7 +103,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in j and store flux in register
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -139,7 +136,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in j
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -160,7 +156,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -174,7 +169,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in i and write back
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -296,7 +290,6 @@ void surfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     //interpolate traces, store flux in register
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -319,7 +312,6 @@ void surfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     //write fluxes to @shared
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -369,7 +361,6 @@ void surfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -387,7 +378,6 @@ void surfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -404,7 +394,6 @@ void surfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
diff --git a/solvers/fokkerPlanck/okl/fpeCubatureAdvectionTet3D.okl b/solvers/fokkerPlanck/okl/fpeCubatureAdvectionTet3D.okl
index 278b6881d..ef7e9cdda 100644
--- a/solvers/fokkerPlanck/okl/fpeCubatureAdvectionTet3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeCubatureAdvectionTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -61,7 +61,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_cubNp;++n;@inner(0)){     // for all nodes in this element
@@ -101,7 +100,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_cubNp;++n;@inner(0)){     // for all nodes in this element
@@ -169,7 +167,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -234,7 +231,6 @@ SOFTWARE.
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // lift from surface integration to volume nodes
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
diff --git a/solvers/fokkerPlanck/okl/fpeCubatureAdvectionTri2D.okl b/solvers/fokkerPlanck/okl/fpeCubatureAdvectionTri2D.okl
index 34d9cb424..66a185e2b 100644
--- a/solvers/fokkerPlanck/okl/fpeCubatureAdvectionTri2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeCubatureAdvectionTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -59,7 +59,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_cubNp;++n;@inner(0)){     // for all nodes in this element
@@ -81,7 +80,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_cubNp;++n;@inner(0)){     // for all nodes in this element
@@ -160,7 +158,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -224,7 +221,6 @@ SOFTWARE.
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // lift from surface integration to volume nodes
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
diff --git a/solvers/fokkerPlanck/okl/fpeDiffusionHex3D.okl b/solvers/fokkerPlanck/okl/fpeDiffusionHex3D.okl
index 419f3970e..a246d95e4 100644
--- a/solvers/fokkerPlanck/okl/fpeDiffusionHex3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeDiffusionHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -127,7 +127,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -148,7 +147,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -167,7 +165,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -188,7 +185,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -214,7 +210,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -235,7 +230,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -261,7 +255,6 @@ void surfaceTerms(const int emap,
       }
     }
 
-    @barrier("local");
 
     //layer by layer
     #pragma unroll p_Nq
@@ -293,7 +286,6 @@ void surfaceTerms(const int emap,
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -310,7 +302,6 @@ void surfaceTerms(const int emap,
         }
       }
 
-      @barrier("local");
     }
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
diff --git a/solvers/fokkerPlanck/okl/fpeDiffusionQuad2D.okl b/solvers/fokkerPlanck/okl/fpeDiffusionQuad2D.okl
index c6babd8de..5c409ec18 100644
--- a/solvers/fokkerPlanck/okl/fpeDiffusionQuad2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeDiffusionQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -113,8 +113,6 @@ void surfaceTerms(const int element,
       }
     }
 
-    @barrier("local");
-
     // loop over faces to add pseudo-gradient
 
     // face 0 & 2
@@ -129,8 +127,6 @@ void surfaceTerms(const int element,
       surfaceTerms(element, sk2, 2, i, p_Nq-1, tau, sgeo, vmapM, vmapP, EToB, gradq, s_dqdx, s_dqdy, s_rhsq);
     }
 
-    @barrier("local");
-
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
       dlong sk1 = element*p_Nfp*p_Nfaces + 1*p_Nfp + j;
@@ -143,8 +139,6 @@ void surfaceTerms(const int element,
       surfaceTerms(element, sk3, 3, 0, j, tau, sgeo, vmapM, vmapP, EToB, gradq, s_dqdx, s_dqdy, s_rhsq);
     }
 
-    @barrier("local");
-
     // prescale by geofacs
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -165,8 +159,6 @@ void surfaceTerms(const int element,
       }
     }
 
-    @barrier("local");
-
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -186,4 +178,4 @@ void surfaceTerms(const int element,
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/fokkerPlanck/okl/fpeDiffusionRhsHex3D.okl b/solvers/fokkerPlanck/okl/fpeDiffusionRhsHex3D.okl
index 661959179..670ae0e82 100644
--- a/solvers/fokkerPlanck/okl/fpeDiffusionRhsHex3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeDiffusionRhsHex3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -92,7 +92,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -102,7 +101,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 0
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -115,7 +113,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -125,7 +122,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -138,7 +134,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 1
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -148,7 +143,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 1
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -165,7 +159,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -175,7 +168,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -192,7 +184,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 2
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -202,7 +193,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 2
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -219,7 +209,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -229,7 +218,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -246,7 +234,6 @@
       }
     }
 
-    @barrier("local");
 
     //layer by layer
     #pragma unroll p_Nq
@@ -278,7 +265,6 @@
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -297,7 +283,6 @@
         }
       }
 
-      @barrier("local");
     }
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -391,7 +376,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 0
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -404,7 +388,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -414,7 +397,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -427,7 +409,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 1
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -437,7 +418,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 1
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -454,7 +434,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -464,7 +443,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -481,7 +459,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 2
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -491,7 +468,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 2
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -508,7 +484,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -518,7 +493,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -535,7 +509,6 @@
       }
     }
 
-    @barrier("local");
 
     // Layer by layer
 #pragma unroll p_Nq
@@ -558,7 +531,6 @@
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -573,7 +545,6 @@
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -596,7 +567,6 @@
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/fokkerPlanck/okl/fpeDiffusionRhsQuad2D.okl b/solvers/fokkerPlanck/okl/fpeDiffusionRhsQuad2D.okl
index ee7b4ae4c..4a71104c7 100644
--- a/solvers/fokkerPlanck/okl/fpeDiffusionRhsQuad2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeDiffusionRhsQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -82,7 +82,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over faces to add pseudo-gradient
 
@@ -95,7 +94,6 @@ SOFTWARE.
       surfaceTerms(sk2,2,i,p_Nq-1);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -106,7 +104,6 @@ SOFTWARE.
       surfaceTerms(sk3,3,0     ,j);
     }
 
-    @barrier("local");
 
     // prescale by geofacs
     for(int j=0;j<p_Nq;++j){
@@ -127,7 +124,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
@@ -209,7 +205,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 2
     for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -220,7 +215,6 @@ SOFTWARE.
       surfaceTerms(sk2,2,i,p_Nq-1);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -231,7 +225,6 @@ SOFTWARE.
       surfaceTerms(sk3,3,0     ,j);
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
@@ -258,7 +251,6 @@ SOFTWARE.
     }
 
     // r term ----->
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -266,7 +258,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -281,7 +272,6 @@ SOFTWARE.
     }
 
     // s term ---->
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -289,7 +279,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/fokkerPlanck/okl/fpeDiffusionRhsTet3D.okl b/solvers/fokkerPlanck/okl/fpeDiffusionRhsTet3D.okl
index 7814ccf14..26df04e77 100644
--- a/solvers/fokkerPlanck/okl/fpeDiffusionRhsTet3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeDiffusionRhsTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -94,7 +94,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dqdx += LIFT*(sJ/J)*nx*dq
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -139,7 +138,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -166,7 +164,6 @@ SOFTWARE.
 
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -182,7 +179,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -234,7 +230,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -264,7 +259,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
diff --git a/solvers/fokkerPlanck/okl/fpeDiffusionRhsTri2D.okl b/solvers/fokkerPlanck/okl/fpeDiffusionRhsTri2D.okl
index f48773944..997a89ad0 100644
--- a/solvers/fokkerPlanck/okl/fpeDiffusionRhsTri2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeDiffusionRhsTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -85,7 +85,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dqdx += LIFT*(sJ/J)*nx*dq
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -116,7 +115,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -141,7 +139,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -156,7 +153,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -207,7 +203,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -236,7 +231,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
diff --git a/solvers/fokkerPlanck/okl/fpeDiffusionTet3D.okl b/solvers/fokkerPlanck/okl/fpeDiffusionTet3D.okl
index c25caec96..b046eeaaa 100644
--- a/solvers/fokkerPlanck/okl/fpeDiffusionTet3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeDiffusionTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -109,7 +109,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dqdx += LIFT*(sJ/J)*nx*dq
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -152,7 +151,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -174,7 +172,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
diff --git a/solvers/fokkerPlanck/okl/fpeDiffusionTri2D.okl b/solvers/fokkerPlanck/okl/fpeDiffusionTri2D.okl
index 5bfb03f58..46587908c 100644
--- a/solvers/fokkerPlanck/okl/fpeDiffusionTri2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeDiffusionTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -173,7 +173,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NbV;++es;@inner(1)){//
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -224,7 +223,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NbV;++es;@inner(1)){
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -266,7 +264,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NbV;++es;@inner(1)){
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
diff --git a/solvers/fokkerPlanck/okl/fpeGradientHex3D.okl b/solvers/fokkerPlanck/okl/fpeGradientHex3D.okl
index ba323bb73..79349db34 100644
--- a/solvers/fokkerPlanck/okl/fpeGradientHex3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeGradientHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -53,7 +53,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int k=0;k<p_Nq;++k){
@@ -94,4 +93,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/fokkerPlanck/okl/fpeGradientQuad2D.okl b/solvers/fokkerPlanck/okl/fpeGradientQuad2D.okl
index 819544ecf..dc0c6b421 100644
--- a/solvers/fokkerPlanck/okl/fpeGradientQuad2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeGradientQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -49,7 +49,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -80,4 +79,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/fokkerPlanck/okl/fpeGradientTet3D.okl b/solvers/fokkerPlanck/okl/fpeGradientTet3D.okl
index 7aa7d0b26..5c6c60971 100644
--- a/solvers/fokkerPlanck/okl/fpeGradientTet3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeGradientTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -46,7 +46,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -86,4 +85,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/fokkerPlanck/okl/fpeGradientTri2D.okl b/solvers/fokkerPlanck/okl/fpeGradientTri2D.okl
index b671d10ca..4cb35c0a6 100644
--- a/solvers/fokkerPlanck/okl/fpeGradientTri2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeGradientTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -60,7 +60,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int e=eo;e<eo+p_NblockV;++e;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
diff --git a/solvers/fokkerPlanck/okl/fpeInitialCondition2D.okl b/solvers/fokkerPlanck/okl/fpeInitialCondition2D.okl
index 39ab66f0f..8b08b27a7 100644
--- a/solvers/fokkerPlanck/okl/fpeInitialCondition2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeInitialCondition2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/fokkerPlanck/okl/fpeInitialCondition3D.okl b/solvers/fokkerPlanck/okl/fpeInitialCondition3D.okl
index 73d108995..52c06cd1e 100644
--- a/solvers/fokkerPlanck/okl/fpeInitialCondition3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeInitialCondition3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedHex3D.okl b/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedHex3D.okl
index 845e4a113..379d76bbc 100644
--- a/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedHex3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedQuad2D.okl b/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedQuad2D.okl
index fcc864819..12482f406 100644
--- a/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedQuad2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedTet3D.okl b/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedTet3D.okl
index 7b4a72d31..a8e1c5b00 100644
--- a/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedTet3D.okl
+++ b/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedTri2D.okl b/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedTri2D.okl
index b9f0f5af7..0aa35a25d 100644
--- a/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedTri2D.okl
+++ b/solvers/fokkerPlanck/okl/fpeMaxWaveSpeedTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/fokkerPlanck/src/fpePlotFields.cpp b/solvers/fokkerPlanck/src/fpePlotFields.cpp
index 876d293e2..5eb8f4580 100644
--- a/solvers/fokkerPlanck/src/fpePlotFields.cpp
+++ b/solvers/fokkerPlanck/src/fpePlotFields.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,11 @@ SOFTWARE.
 #include "fpe.hpp"
 
 // interpolate data to plot nodes and save to file (one per process
-void fpe_t::PlotFields(dfloat* Q, char *fileName){
+void fpe_t::PlotFields(memory<dfloat>& Q, std::string fileName){
 
   FILE *fp;
 
-  fp = fopen(fileName, "w");
+  fp = fopen(fileName.c_str(), "w");
 
   fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
   fprintf(fp, "  <UnstructuredGrid>\n");
@@ -44,30 +44,36 @@ void fpe_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
 
   //scratch space for interpolation
-  size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat);
-  dfloat* scratch = (dfloat *) malloc(2*NscratchBytes);
+  size_t Nscratch = std::max(mesh.Np, mesh.plotNp);
+  memory<dfloat> scratch(2*Nscratch);
 
-  dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ix(mesh.plotNp);
+  memory<dfloat> Iy(mesh.plotNp);
+  memory<dfloat> Iz(mesh.plotNp);
 
   // compute plot node coordinates on the fly
   for(dlong e=0;e<mesh.Nelements;++e){
     mesh.PlotInterp(mesh.x + e*mesh.Np, Ix, scratch);
     mesh.PlotInterp(mesh.y + e*mesh.Np, Iy, scratch);
-    mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
+    if(mesh.dim==3)
+      mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
 
-    for(int n=0;n<mesh.plotNp;++n){
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+    if (mesh.dim==2) {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],0.0);
+      }
+    } else {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+      }
     }
   }
   fprintf(fp, "        </DataArray>\n");
   fprintf(fp, "      </Points>\n");
 
-  free(Ix); free(Iy); free(Iz);
-
-  dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ip(mesh.plotNp);
 
   // write out field
   fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
@@ -83,8 +89,6 @@ void fpe_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "       </DataArray>\n");
   fprintf(fp, "     </PointData>\n");
 
-  free(Ip);
-
   fprintf(fp, "    <Cells>\n");
   fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
 
@@ -125,6 +129,4 @@ void fpe_t::PlotFields(dfloat* Q, char *fileName){
   fprintf(fp, "  </UnstructuredGrid>\n");
   fprintf(fp, "</VTKFile>\n");
   fclose(fp);
-
-  free(scratch);
 }
diff --git a/solvers/fokkerPlanck/src/fpeReport.cpp b/solvers/fokkerPlanck/src/fpeReport.cpp
index d1f76988f..57a7e2ca7 100644
--- a/solvers/fokkerPlanck/src/fpeReport.cpp
+++ b/solvers/fokkerPlanck/src/fpeReport.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -34,7 +34,7 @@ void fpe_t::Report(dfloat time, int tstep){
   mesh.MassMatrixApply(o_q, o_Mq);
 
   dlong Nentries = mesh.Nelements*mesh.Np;
-  dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+  dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
   if(mesh.rank==0)
     printf("%5.2f (%d), %5.2f (time, timestep, norm)\n", time, tstep, norm2);
@@ -45,11 +45,11 @@ void fpe_t::Report(dfloat time, int tstep){
     o_q.copyTo(q);
 
     // output field files
-    string name;
+    std::string name;
     settings.getSetting("OUTPUT FILE NAME", name);
     char fname[BUFSIZ];
     sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++);
 
-    PlotFields(q, fname);
+    PlotFields(q, std::string(fname));
   }
 }
diff --git a/solvers/fokkerPlanck/src/fpeRun.cpp b/solvers/fokkerPlanck/src/fpeRun.cpp
index 8db0aecd0..149e849b3 100644
--- a/solvers/fokkerPlanck/src/fpeRun.cpp
+++ b/solvers/fokkerPlanck/src/fpeRun.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -54,14 +54,14 @@ void fpe_t::Run(){
     dt = dtAdvc;
   } else if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")) {
     dt = Nsubcycles*dtAdvc;
-    subStepper->SetTimeStep(dtAdvc);
+    subStepper.SetTimeStep(dtAdvc);
   } else {
-    dt = mymin(dtAdvc, dtDiff);
+    dt = std::min(dtAdvc, dtDiff);
   }
 
-  timeStepper->SetTimeStep(dt);
+  timeStepper.SetTimeStep(dt);
 
-  timeStepper->Run(o_q, startTime, finalTime);
+  timeStepper.Run(*this, o_q, startTime, finalTime);
 
   // output norm of final solution
   {
@@ -69,7 +69,7 @@ void fpe_t::Run(){
     mesh.MassMatrixApply(o_q, o_Mq);
 
     dlong Nentries = mesh.Nelements*mesh.Np;
-    dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+    dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
     if(mesh.rank==0)
       printf("Solution norm = %17.15lg\n", norm2);
diff --git a/solvers/fokkerPlanck/src/fpeSettings.cpp b/solvers/fokkerPlanck/src/fpeSettings.cpp
index 036849eeb..c4acb693b 100644
--- a/solvers/fokkerPlanck/src/fpeSettings.cpp
+++ b/solvers/fokkerPlanck/src/fpeSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ SOFTWARE.
 #include "fpe.hpp"
 
 //settings for fpe solver
-fpeSettings_t::fpeSettings_t(MPI_Comm& _comm):
+fpeSettings_t::fpeSettings_t(comm_t& _comm):
   settings_t(_comm) {
 
   newSetting("DATA FILE",
@@ -87,10 +87,7 @@ fpeSettings_t::fpeSettings_t(MPI_Comm& _comm):
 
 void fpeSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) {
     std::cout << "Fokker Planck Settings:\n\n";
     reportSetting("DATA FILE");
     reportSetting("VISCOSITY");
@@ -132,15 +129,15 @@ void fpeSettings_t::report() {
 
 void fpeSettings_t::parseFromFile(platformSettings_t& platformSettings,
                                   meshSettings_t& meshSettings,
-                                  const string filename) {
+                                  const std::string filename) {
   //read all settings from file
   settings_t s(comm);
   s.readSettingsFromFile(filename);
 
   for(auto it = s.settings.begin(); it != s.settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
-    const string val = set->getVal<string>();
+    setting_t& set = it->second;
+    const std::string name = set.getName();
+    const std::string val = set.getVal<std::string>();
     if (platformSettings.hasSetting(name))
       platformSettings.changeSetting(name, val);
     else if (meshSettings.hasSetting(name))
@@ -148,26 +145,24 @@ void fpeSettings_t::parseFromFile(platformSettings_t& platformSettings,
     else if (hasSetting(name)) //self
       changeSetting(name, val);
     else  {
-      stringstream ss;
-      ss << "Unknown setting: [" << name << "] requested";
-      LIBP_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested");
     }
   }
 }
 
-ellipticSettings_t* fpeSettings_t::extractEllipticSettings() {
+ellipticSettings_t fpeSettings_t::extractEllipticSettings() {
 
-  ellipticSettings_t* ellipticSettings = new ellipticSettings_t(comm);
+  ellipticSettings_t ellipticSettings(comm);
 
-  for(auto it = ellipticSettings->settings.begin(); it != ellipticSettings->settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
+  for(auto it = ellipticSettings.settings.begin(); it != ellipticSettings.settings.end(); ++it) {
+    setting_t& set = it->second;
+    const std::string name = set.getName();
 
-    string val;
+    std::string val;
     getSetting("ELLIPTIC "+name, val);
 
-    set->updateVal(val);
+    set.updateVal(val);
   }
 
   return ellipticSettings;
-}
\ No newline at end of file
+}
diff --git a/solvers/fokkerPlanck/src/fpeSetup.cpp b/solvers/fokkerPlanck/src/fpeSetup.cpp
index c75123f52..947d32c48 100644
--- a/solvers/fokkerPlanck/src/fpeSetup.cpp
+++ b/solvers/fokkerPlanck/src/fpeSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,19 +26,25 @@ SOFTWARE.
 
 #include "fpe.hpp"
 
-fpe_t& fpe_t::Setup(platform_t& platform, mesh_t& mesh,
-                    fpeSettings_t& settings){
+void fpe_t::Setup(platform_t& _platform, mesh_t& _mesh,
+                  fpeSettings_t& _settings){
 
-  fpe_t* fpe = new fpe_t(platform, mesh, settings);
+  platform = _platform;
+  mesh = _mesh;
+  comm = _mesh.comm;
+  settings = _settings;
 
-  settings.getSetting("VISCOSITY", fpe->mu);
+  //Trigger JIT kernel builds
+  ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add);
 
-  fpe->cubature = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 1:0;
+  settings.getSetting("VISCOSITY", mu);
+
+  cubature = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 1:0;
 
   //setup cubature
-  if (fpe->cubature) {
+  if (cubature) {
     mesh.CubatureSetup();
-    mesh.CubatureNodes();
+    mesh.CubaturePhysicalNodes();
   }
 
   dlong Nlocal = mesh.Nelements*mesh.Np;
@@ -47,145 +53,171 @@ fpe_t& fpe_t::Setup(platform_t& platform, mesh_t& mesh,
   //setup timeStepper
   dfloat gamma = 0.0;
   if (settings.compareSetting("TIME INTEGRATOR","AB3")){
-    fpe->timeStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, 1, *fpe);
+    timeStepper.Setup<TimeStepper::ab3>(mesh.Nelements,
+                                        mesh.totalHaloPairs,
+                                        mesh.Np, 1, platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","LSERK4")){
-    fpe->timeStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, 1, *fpe);
+    timeStepper.Setup<TimeStepper::lserk4>(mesh.Nelements,
+                                           mesh.totalHaloPairs,
+                                           mesh.Np, 1, platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","DOPRI5")){
-    fpe->timeStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, 1, *fpe, mesh.comm);
+    timeStepper.Setup<TimeStepper::dopri5>(mesh.Nelements,
+                                           mesh.totalHaloPairs,
+                                           mesh.Np, 1, platform, comm);
   } else if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3")){
-    fpe->timeStepper = new TimeStepper::extbdf3(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, 1, *fpe);
-    gamma = ((TimeStepper::extbdf3*) fpe->timeStepper)->getGamma();
+    timeStepper.Setup<TimeStepper::extbdf3>(mesh.Nelements,
+                                            mesh.totalHaloPairs,
+                                            mesh.Np, 1, platform, comm);
+    gamma = timeStepper.GetGamma();
   } else if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")){
-    fpe->timeStepper = new TimeStepper::ssbdf3(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, 1, *fpe);
-    gamma = ((TimeStepper::ssbdf3*) fpe->timeStepper)->getGamma();
+    timeStepper.Setup<TimeStepper::ssbdf3>(mesh.Nelements,
+                                           mesh.totalHaloPairs,
+                                           mesh.Np, 1, platform, comm);
+    gamma = timeStepper.GetGamma();
   }
 
-  fpe->Nsubcycles=1;
+  Nsubcycles=1;
   if (settings.compareSetting("TIME INTEGRATOR","SSBDF3"))
-    settings.getSetting("NUMBER OF SUBCYCLES", fpe->Nsubcycles);
+    settings.getSetting("NUMBER OF SUBCYCLES", Nsubcycles);
 
   //Setup Elliptic solver
-  fpe->elliptic=NULL;
-  fpe->linearSolver=NULL;
   if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3")
     ||settings.compareSetting("TIME INTEGRATOR","SSBDF3")){
 
     int NBCTypes = 7;
-    int BCType[NBCTypes] = {0,1,1,2,1,1,1}; // bc=3 => outflow => Neumann   => vBCType[3] = 2, etc.
-
-    fpe->ellipticSettings = settings.extractEllipticSettings();
+    memory<int> BCType(NBCTypes);
+    // bc=3 => outflow => Neumann   => BCType[3] = 2, etc.
+    BCType[0] = 0;
+    BCType[1] = 1;
+    BCType[2] = 1;
+    BCType[3] = 2;
+    BCType[4] = 1;
+    BCType[5] = 1;
+    BCType[6] = 1;
+
+    ellipticSettings = _settings.extractEllipticSettings();
 
     //make a guess at dt for the lambda value
     //TODO: we should allow preconditioners to be re-setup if lambda is updated
     dfloat hmin = mesh.MinCharacteristicLength();
-    dfloat dtAdvc = fpe->Nsubcycles*hmin/((mesh.N+1.)*(mesh.N+1.));
-    dfloat lambda = gamma/(dtAdvc*fpe->mu);
-
-    fpe->elliptic = &(elliptic_t::Setup(platform, mesh, *(fpe->ellipticSettings),
-                                             lambda, NBCTypes, BCType));
-    fpe->tau = fpe->elliptic->tau;
-
-    fpe->linearSolver = linearSolver_t::Setup(fpe->elliptic->Ndofs, fpe->elliptic->Nhalo,
-                                              platform, *(fpe->ellipticSettings), mesh.comm);
+    dfloat dtAdvc = Nsubcycles*hmin/((mesh.N+1.)*(mesh.N+1.));
+    dfloat lambda = gamma/(dtAdvc*mu);
+
+    elliptic.Setup(platform, mesh, ellipticSettings,
+                   lambda, NBCTypes, BCType);
+    tau = elliptic.tau;
+
+    if (ellipticSettings.compareSetting("LINEAR SOLVER","NBPCG")){
+      linearSolver.Setup<LinearSolver::nbpcg>(elliptic.Ndofs, elliptic.Nhalo,
+                                              platform, ellipticSettings, comm);
+    } else if (ellipticSettings.compareSetting("LINEAR SOLVER","NBFPCG")){
+      linearSolver.Setup<LinearSolver::nbfpcg>(elliptic.Ndofs, elliptic.Nhalo,
+                                              platform, ellipticSettings, comm);
+    } else if (ellipticSettings.compareSetting("LINEAR SOLVER","PCG")){
+      linearSolver.Setup<LinearSolver::pcg>(elliptic.Ndofs, elliptic.Nhalo,
+                                              platform, ellipticSettings, comm);
+    } else if (ellipticSettings.compareSetting("LINEAR SOLVER","PGMRES")){
+      linearSolver.Setup<LinearSolver::pgmres>(elliptic.Ndofs, elliptic.Nhalo,
+                                              platform, ellipticSettings, comm);
+    } else if (ellipticSettings.compareSetting("LINEAR SOLVER","PMINRES")){
+      linearSolver.Setup<LinearSolver::pminres>(elliptic.Ndofs, elliptic.Nhalo,
+                                              platform, ellipticSettings, comm);
+    }
   } else {
     //set penalty
-    if (mesh.elementType==TRIANGLES ||
-        mesh.elementType==QUADRILATERALS){
-      fpe->tau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0;
+    if (mesh.elementType==Mesh::TRIANGLES ||
+        mesh.elementType==Mesh::QUADRILATERALS){
+      tau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0;
       if(mesh.dim==3)
-        fpe->tau *= 1.5;
+        tau *= 1.5;
     } else
-      fpe->tau = 2.0*(mesh.N+1)*(mesh.N+3);
+      tau = 2.0*(mesh.N+1)*(mesh.N+3);
   }
 
   //setup linear algebra module
-  platform.linAlg.InitKernels({"innerProd", "axpy", "max"});
+  platform.linAlg().InitKernels({"innerProd", "axpy", "max"});
 
   /*setup trace halo exchange */
-  fpe->traceHalo = mesh.HaloTraceSetup(1); //one field
+  traceHalo = mesh.HaloTraceSetup(1); //one field
 
   // compute samples of q at interpolation nodes
-  fpe->q = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat));
-  fpe->o_q = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), fpe->q);
+  q.malloc(Nlocal+Nhalo, 0.0);
+  o_q = platform.malloc<dfloat>(q);
 
   //storage for M*q during reporting
-  fpe->o_Mq = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), fpe->q);
+  o_Mq = platform.malloc<dfloat>(q);
   mesh.MassMatrixKernelSetup(1); // mass matrix operator
 
-  fpe->grad = (dfloat*) calloc((Nlocal+Nhalo)*4, sizeof(dfloat));
-  fpe->o_grad  = platform.malloc((Nlocal+Nhalo)*4*sizeof(dfloat), fpe->grad);
+  grad.malloc((Nlocal+Nhalo)*4, 0.0);
+  o_grad  = platform.malloc<dfloat>(grad);
 
   // OCCA build stuff
-  occa::properties kernelInfo = mesh.props; //copy base occa properties
+  properties_t kernelInfo = mesh.props; //copy base occa properties
 
   //add boundary data to kernel info
-  string dataFileName;
+  std::string dataFileName;
   settings.getSetting("DATA FILE", dataFileName);
   kernelInfo["includes"] += dataFileName;
 
   kernelInfo["defines/" "p_Nfields"]= 1;
 
-  int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces));
+  int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces));
   kernelInfo["defines/" "p_maxNodes"]= maxNodes;
 
   int blockMax = 256;
   if (platform.device.mode() == "CUDA") blockMax = 512;
 
-  int NblockV = mymax(1, blockMax/mesh.Np);
+  int NblockV = std::max(1, blockMax/mesh.Np);
   kernelInfo["defines/" "p_NblockV"]= NblockV;
 
-  int NblockS = mymax(1, blockMax/maxNodes);
+  int NblockS = std::max(1, blockMax/maxNodes);
   kernelInfo["defines/" "p_NblockS"]= NblockS;
 
-  if (fpe->cubature) {
-    int cubMaxNodes = mymax(mesh.Np, (mesh.intNfp*mesh.Nfaces));
+  if (cubature) {
+    int cubMaxNodes = std::max(mesh.Np, (mesh.intNfp*mesh.Nfaces));
     kernelInfo["defines/" "p_cubMaxNodes"]= cubMaxNodes;
-    int cubMaxNodes1 = mymax(mesh.Np, (mesh.intNfp));
+    int cubMaxNodes1 = std::max(mesh.Np, (mesh.intNfp));
     kernelInfo["defines/" "p_cubMaxNodes1"]= cubMaxNodes1;
 
-    int cubNblockV = mymax(1, blockMax/mesh.cubNp);
+    int cubNblockV = std::max(1, blockMax/mesh.cubNp);
     kernelInfo["defines/" "p_cubNblockV"]= cubNblockV;
 
-    int cubNblockS = mymax(1, blockMax/cubMaxNodes);
+    int cubNblockS = std::max(1, blockMax/cubMaxNodes);
     kernelInfo["defines/" "p_cubNblockS"]= cubNblockS;
   }
 
-  kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
-
   // set kernel name suffix
-  char *suffix;
-  if(mesh.elementType==TRIANGLES)
-    suffix = strdup("Tri2D");
-  if(mesh.elementType==QUADRILATERALS)
-    suffix = strdup("Quad2D");
-  if(mesh.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  if(mesh.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  std::string suffix;
+  if(mesh.elementType==Mesh::TRIANGLES)
+    suffix = "Tri2D";
+  if(mesh.elementType==Mesh::QUADRILATERALS)
+    suffix = "Quad2D";
+  if(mesh.elementType==Mesh::TETRAHEDRA)
+    suffix = "Tet3D";
+  if(mesh.elementType==Mesh::HEXAHEDRA)
+    suffix = "Hex3D";
+
+  std::string oklFilePrefix = DFPE "/okl/";
+  std::string oklFileSuffix = ".okl";
+
+  std::string fileName, kernelName;
 
   // advection kernels
-  if (fpe->cubature) {
-    sprintf(fileName, DFPE "/okl/fpeCubatureAdvection%s.okl", suffix);
-    sprintf(kernelName, "fpeAdvectionCubatureVolume%s", suffix);
-    fpe->advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
+  if (cubature) {
+    fileName   = oklFilePrefix + "fpeCubatureAdvection" + suffix + oklFileSuffix;
+    kernelName = "fpeAdvectionCubatureVolume" + suffix;
+    advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
-    sprintf(kernelName, "fpeAdvectionCubatureSurface%s", suffix);
-    fpe->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
+    kernelName = "fpeAdvectionCubatureSurface" + suffix;
+    advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   } else {
-    sprintf(fileName, DFPE "/okl/fpeAdvection%s.okl", suffix);
-    sprintf(kernelName, "fpeAdvectionVolume%s", suffix);
-    fpe->advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
+    fileName   = oklFilePrefix + "fpeAdvection" + suffix + oklFileSuffix;
+    kernelName = "fpeAdvectionVolume" + suffix;
+    advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
-    sprintf(kernelName, "fpeAdvectionSurface%s", suffix);
-    fpe->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
+    kernelName = "fpeAdvectionSurface" + suffix;
+    advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   }
 
@@ -193,72 +225,63 @@ fpe_t& fpe_t::Setup(platform_t& platform, mesh_t& mesh,
   // diffusion kernels
   if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3")
     ||settings.compareSetting("TIME INTEGRATOR","SSBDF3")) {
-    sprintf(fileName, DFPE "/okl/fpeDiffusionRhs%s.okl", suffix);
-    sprintf(kernelName, "fpeDiffusionRhs%s", suffix);
-    fpe->diffusionRhsKernel =  platform.buildKernel(fileName, kernelName,
+    fileName   = oklFilePrefix + "fpeDiffusionRhs" + suffix + oklFileSuffix;
+    kernelName = "fpeDiffusionRhs" + suffix;
+    diffusionRhsKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   } else {
     // gradient kernel
-    sprintf(fileName, DFPE "/okl/fpeGradient%s.okl", suffix);
-    sprintf(kernelName, "fpeGradient%s", suffix);
-    fpe->gradientKernel =  platform.buildKernel(fileName, kernelName,
+    fileName   = oklFilePrefix + "fpeGradient" + suffix + oklFileSuffix;
+    kernelName = "fpeGradient" + suffix;
+    gradientKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
 
-    sprintf(fileName, DFPE "/okl/fpeDiffusion%s.okl", suffix);
-    sprintf(kernelName, "fpeDiffusion%s", suffix);
-    fpe->diffusionKernel =  platform.buildKernel(fileName, kernelName,
+    fileName   = oklFilePrefix + "fpeDiffusion" + suffix + oklFileSuffix;
+    kernelName = "fpeDiffusion" + suffix;
+    diffusionKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   }
 
   if (mesh.dim==2) {
-    sprintf(fileName, DFPE "/okl/fpeInitialCondition2D.okl");
-    sprintf(kernelName, "fpeInitialCondition2D");
+    fileName   = oklFilePrefix + "fpeInitialCondition2D" + oklFileSuffix;
+    kernelName = "fpeInitialCondition2D";
   } else {
-    sprintf(fileName, DFPE "/okl/fpeInitialCondition3D.okl");
-    sprintf(kernelName, "fpeInitialCondition3D");
+    fileName   = oklFilePrefix + "fpeInitialCondition3D" + oklFileSuffix;
+    kernelName = "fpeInitialCondition3D";
   }
 
-  fpe->initialConditionKernel = platform.buildKernel(fileName, kernelName,
+  initialConditionKernel = platform.buildKernel(fileName, kernelName,
                                                   kernelInfo);
 
-  sprintf(fileName, DFPE "/okl/fpeMaxWaveSpeed%s.okl", suffix);
-  sprintf(kernelName, "fpeMaxWaveSpeed%s", suffix);
+  fileName   = oklFilePrefix + "fpeMaxWaveSpeed" + suffix + oklFileSuffix;
+  kernelName = "fpeMaxWaveSpeed" + suffix;
 
-  fpe->maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
+  maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
 
   //build subcycler
-  fpe->subcycler=NULL;
-  fpe->subStepper=NULL;
   if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")) {
-    fpe->subcycler  = new subcycler_t(*fpe);
+    subcycler.platform = platform;
+    subcycler.mesh = mesh;
+    subcycler.comm = comm;
+    subcycler.settings = settings;
+
+    subcycler.cubature = cubature;
+    subcycler.traceHalo = traceHalo;
+    subcycler.advectionVolumeKernel = advectionVolumeKernel;
+    subcycler.advectionSurfaceKernel = advectionSurfaceKernel;
+
     if (settings.compareSetting("SUBCYCLING TIME INTEGRATOR","AB3")){
-      fpe->subStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs,
-                                                mesh.Np, 1, *(fpe->subcycler));
+      subStepper.Setup<TimeStepper::ab3>(mesh.Nelements,
+                                         mesh.totalHaloPairs,
+                                         mesh.Np, 1, platform, comm);
     } else if (settings.compareSetting("SUBCYCLING TIME INTEGRATOR","LSERK4")){
-      fpe->subStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs,
-                                                mesh.Np, 1, *(fpe->subcycler));
+      subStepper.Setup<TimeStepper::lserk4>(mesh.Nelements,
+                                            mesh.totalHaloPairs,
+                                            mesh.Np, 1, platform, comm);
     } else if (settings.compareSetting("SUBCYCLING TIME INTEGRATOR","DOPRI5")){
-      fpe->subStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs,
-                                                mesh.Np, 1, *(fpe->subcycler), mesh.comm);
+      subStepper.Setup<TimeStepper::dopri5>(mesh.Nelements,
+                                            mesh.totalHaloPairs,
+                                            mesh.Np, 1, platform, comm);
     }
   }
-
-  return *fpe;
-}
-
-fpe_t::~fpe_t() {
-  advectionVolumeKernel.free();
-  advectionSurfaceKernel.free();
-  gradientKernel.free();
-  diffusionKernel.free();
-  diffusionRhsKernel.free();
-  initialConditionKernel.free();
-  maxWaveSpeedKernel.free();
-
-  if (elliptic) delete elliptic;
-  if (timeStepper) delete timeStepper;
-  if (linearSolver) delete linearSolver;
-  if (subStepper) delete subStepper;
-  if (subcycler) delete subcycler;
-  if (traceHalo) traceHalo->Free();
 }
diff --git a/solvers/fokkerPlanck/src/fpeStep.cpp b/solvers/fokkerPlanck/src/fpeStep.cpp
index cf5b63fbf..1c22d332b 100644
--- a/solvers/fokkerPlanck/src/fpeStep.cpp
+++ b/solvers/fokkerPlanck/src/fpeStep.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,10 +26,10 @@ SOFTWARE.
 
 #include "fpe.hpp"
 
-dfloat fpe_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){
+dfloat fpe_t::MaxWaveSpeed(deviceMemory<dfloat>& o_Q, const dfloat T){
 
   //Note: if this is on the critical path in the future, we should pre-allocate this
-  occa::memory o_maxSpeed = platform.malloc(mesh.Nelements*sizeof(dfloat));
+  deviceMemory<dfloat> o_maxSpeed = platform.malloc<dfloat>(mesh.Nelements);
 
   maxWaveSpeedKernel(mesh.Nelements,
                      mesh.o_vgeo,
@@ -43,31 +43,30 @@ dfloat fpe_t::MaxWaveSpeed(occa::memory& o_Q, const dfloat T){
                      o_Q,
                      o_maxSpeed);
 
-  const dfloat vmax = platform.linAlg.max(mesh.Nelements, o_maxSpeed, mesh.comm);
+  const dfloat vmax = platform.linAlg().max(mesh.Nelements, o_maxSpeed, mesh.comm);
 
-  o_maxSpeed.free();
   return vmax;
 }
 
 //evaluate ODE rhs = f(q,t)
-void fpe_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void fpe_t::rhsf(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){
   Advection(o_Q, o_RHS, T);
   Diffusion(o_Q, o_RHS, T);
 }
 
 // Evaluation of rhs f function
-void fpe_t::rhs_imex_f(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void fpe_t::rhs_imex_f(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){
   Advection(o_Q, o_RHS, T);
 }
 
 // Evaluation of rhs g function
-void fpe_t::rhs_imex_g(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void fpe_t::rhs_imex_g(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){
   Diffusion(o_Q, o_RHS, T);
 }
 
 // Inversion of diffusion operator
 //  Solves gamma*q - mu*Laplacian*q = rhs
-void fpe_t::rhs_imex_invg(occa::memory& o_RHS, occa::memory& o_Q, const dfloat gamma, const dfloat T){
+void fpe_t::rhs_imex_invg(deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_Q, const dfloat gamma, const dfloat T){
 
   // rhs = MM*rhs/mu
   diffusionRhsKernel(mesh.Nelements,
@@ -91,9 +90,9 @@ void fpe_t::rhs_imex_invg(occa::memory& o_RHS, occa::memory& o_Q, const dfloat g
   int verbose =0;
 
   //call the solver to solve -Laplacian*q + lambda*q = rhs
-  dfloat tol = 1e-8;
-  elliptic->lambda = gamma/mu;
-  int iter = elliptic->Solve(*linearSolver, o_Q, o_RHS, tol, maxIter, verbose);
+  dfloat tol = (sizeof(dfloat)==sizeof(double)) ? 1.0e-8 : 1.0e-5;
+  elliptic.lambda = gamma/mu;
+  int iter = elliptic.Solve(linearSolver, o_Q, o_RHS, tol, maxIter, verbose);
 
   if (mesh.rank==0){
     printf("\rSolver iterations: %3d.  ", iter); fflush(stdout);
@@ -101,8 +100,8 @@ void fpe_t::rhs_imex_invg(occa::memory& o_RHS, occa::memory& o_Q, const dfloat g
 }
 
 // Evolve rhs f function via a sub-timestepper
-void fpe_t::rhs_subcycle_f(occa::memory& o_Q, occa::memory& o_QHAT,
-                           const dfloat T, const dfloat dt, const dfloat* B,
+void fpe_t::rhs_subcycle_f(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_QHAT,
+                           const dfloat T, const dfloat dt, const memory<dfloat> B,
                            const int order, const int shiftIndex, const int maxOrder) {
 
   //subcycle each Lagrangian state qhat by stepping dqhat/dt = F(qhat,t)
@@ -118,21 +117,21 @@ void fpe_t::rhs_subcycle_f(occa::memory& o_Q, occa::memory& o_QHAT,
   for (int n=order;n>=0;n--) { //for each history state, starting with oldest
 
     //q at t-n*dt
-    occa::memory o_Qn = o_Q + ((shiftIndex+n)%maxOrder)*N*sizeof(dfloat);
+    deviceMemory<dfloat> o_Qn = o_Q + ((shiftIndex+n)%maxOrder)*N;
 
     //next scaled partial sum
-    platform.linAlg.axpy(N, B[n+1]/(B[n+1]+bSum), o_Qn,
-                            bSum/(B[n+1]+bSum), o_QHAT);
+    platform.linAlg().axpy(N, B[n+1]/(B[n+1]+bSum), o_Qn,
+                              bSum/(B[n+1]+bSum), o_QHAT);
     bSum += B[n+1];
 
-    subStepper->Run(o_QHAT, T-n*dt, T-(n-1)*dt);
+    subStepper.Run(subcycler, o_QHAT, T-n*dt, T-(n-1)*dt);
   }
 }
 
-void fpe_t::Advection(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) {
+void fpe_t::Advection(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T) {
 
   // extract q halo on DEVICE
-  traceHalo->ExchangeStart(o_Q, 1, ogs_dfloat);
+  traceHalo.ExchangeStart(o_Q, 1);
 
   if (cubature)
     advectionVolumeKernel(mesh.Nelements,
@@ -159,7 +158,7 @@ void fpe_t::Advection(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) {
                          o_Q,
                          o_RHS);
 
-  traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat);
+  traceHalo.ExchangeFinish(o_Q, 1);
 
   if (cubature)
     advectionSurfaceKernel(mesh.Nelements,
@@ -191,7 +190,7 @@ void fpe_t::Advection(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) {
                           o_RHS);
 }
 
-void fpe_t::Diffusion(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) {
+void fpe_t::Diffusion(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T) {
 
   //compute gradq and pack with q
   gradientKernel(mesh.Nelements,
@@ -200,7 +199,7 @@ void fpe_t::Diffusion(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) {
                   o_Q,
                   o_grad);
 
-  traceHalo->ExchangeStart(o_grad, 4, ogs_dfloat);
+  traceHalo.ExchangeStart(o_grad, 4);
 
   if(mesh.NinternalElements)
     diffusionKernel(mesh.NinternalElements,
@@ -221,7 +220,7 @@ void fpe_t::Diffusion(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) {
                     o_grad,
                     o_RHS);
 
-  traceHalo->ExchangeFinish(o_grad, 4, ogs_dfloat);
+  traceHalo.ExchangeFinish(o_grad, 4);
 
   if(mesh.NhaloElements)
     diffusionKernel(mesh.NhaloElements,
@@ -241,4 +240,4 @@ void fpe_t::Diffusion(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T) {
                     mu,
                     o_grad,
                     o_RHS);
-}
\ No newline at end of file
+}
diff --git a/solvers/fokkerPlanck/src/fpeSubcycle.cpp b/solvers/fokkerPlanck/src/fpeSubcycle.cpp
index 9ad2c2b6d..82211d416 100644
--- a/solvers/fokkerPlanck/src/fpeSubcycle.cpp
+++ b/solvers/fokkerPlanck/src/fpeSubcycle.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,19 +26,10 @@ SOFTWARE.
 
 #include "fpe.hpp"
 
-subcycler_t::subcycler_t(fpe_t& fpe):
-  solver_t(fpe.platform, fpe.settings), mesh(fpe.mesh) {
-
-  cubature = fpe.cubature;
-  traceHalo = fpe.traceHalo;
-  advectionVolumeKernel = fpe.advectionVolumeKernel;
-  advectionSurfaceKernel = fpe.advectionSurfaceKernel;
-}
-
 //evaluate ODE rhs = f(q,t)
-void subcycler_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void subcycler_t::rhsf(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){
   // extract q halo on DEVICE
-  traceHalo->ExchangeStart(o_Q, 1, ogs_dfloat);
+  traceHalo.ExchangeStart(o_Q, 1);
 
   if (cubature)
     advectionVolumeKernel(mesh.Nelements,
@@ -65,7 +56,7 @@ void subcycler_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
                          o_Q,
                          o_RHS);
 
-  traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat);
+  traceHalo.ExchangeFinish(o_Q, 1);
 
   if (cubature)
     advectionSurfaceKernel(mesh.Nelements,
diff --git a/solvers/gradient/data/gradientCos2D.h b/solvers/gradient/data/gradientCos2D.h
index f29987648..7c81d8ac9 100644
--- a/solvers/gradient/data/gradientCos2D.h
+++ b/solvers/gradient/data/gradientCos2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/gradient/data/gradientCos3D.h b/solvers/gradient/data/gradientCos3D.h
index 4585eaa57..9c0f2ca06 100644
--- a/solvers/gradient/data/gradientCos3D.h
+++ b/solvers/gradient/data/gradientCos3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/gradient/gradient.hpp b/solvers/gradient/gradient.hpp
index 1920031d2..39fc0adc3 100644
--- a/solvers/gradient/gradient.hpp
+++ b/solvers/gradient/gradient.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -34,43 +34,44 @@ SOFTWARE.
 
 #define DGRADIENT LIBP_DIR"/solvers/gradient/"
 
+using namespace libp;
+
 class gradientSettings_t: public settings_t {
 public:
-  gradientSettings_t(MPI_Comm& _comm);
+  gradientSettings_t(comm_t _comm);
   void report();
   void parseFromFile(platformSettings_t& platformSettings,
                      meshSettings_t& meshSettings,
-                     const string filename);
+                     const std::string filename);
 };
 
 class gradient_t: public solver_t {
 public:
-  mesh_t& mesh;
+  mesh_t mesh;
 
   int Nfields;
 
-  dfloat *q;
-  occa::memory o_q;
+  memory<dfloat> q;
+  deviceMemory<dfloat> o_q;
 
-  dfloat *gradq;
-  occa::memory o_gradq;
+  memory<dfloat> gradq;
+  deviceMemory<dfloat> o_gradq;
 
-  occa::memory o_Mgradq;
+  deviceMemory<dfloat> o_Mgradq;
 
-  occa::kernel volumeKernel;
+  kernel_t volumeKernel;
 
-  occa::kernel initialConditionKernel;
+  kernel_t initialConditionKernel;
 
-  gradient_t() = delete;
+  gradient_t() = default;
   gradient_t(platform_t &_platform, mesh_t &_mesh,
-              gradientSettings_t& _settings):
-    solver_t(_platform, _settings), mesh(_mesh) {}
-
-  ~gradient_t();
+              gradientSettings_t& _settings) {
+    Setup(_platform, _mesh, _settings);
+  }
 
   //setup
-  static gradient_t& Setup(platform_t& platform, mesh_t& mesh,
-                          gradientSettings_t& settings);
+  void Setup(platform_t& _platform, mesh_t& _mesh,
+             gradientSettings_t& _settings);
 
   void Run();
 
diff --git a/solvers/gradient/gradientMain.cpp b/solvers/gradient/gradientMain.cpp
index 076b6924d..30bbd2085 100644
--- a/solvers/gradient/gradientMain.cpp
+++ b/solvers/gradient/gradientMain.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,39 +29,40 @@ SOFTWARE.
 int main(int argc, char **argv){
 
   // start up MPI
-  MPI_Init(&argc, &argv);
+  Comm::Init(argc, argv);
 
-  MPI_Comm comm = MPI_COMM_WORLD;
+  LIBP_ABORT("Usage: ./gradientMain setupfile", argc!=2);
 
-  if(argc!=2)
-    LIBP_ABORT(string("Usage: ./gradientMain setupfile"));
+  { /*Scope so everything is destructed before MPI_Finalize */
+    comm_t comm(Comm::World().Dup());
 
-  //create default settings
-  platformSettings_t platformSettings(comm);
-  meshSettings_t meshSettings(comm);
-  gradientSettings_t gradientSettings(comm);
+    //create default settings
+    platformSettings_t platformSettings(comm);
+    meshSettings_t meshSettings(comm);
+    gradientSettings_t gradientSettings(comm);
 
-  //load settings from file
-  gradientSettings.parseFromFile(platformSettings, meshSettings,
-                            argv[1]);
+    //load settings from file
+    gradientSettings.parseFromFile(platformSettings, meshSettings,
+                                    argv[1]);
 
-  // set up platform
-  platform_t platform(platformSettings);
+    // set up platform
+    platform_t platform(platformSettings);
 
-  platformSettings.report();
-  meshSettings.report();
-  gradientSettings.report();
+    platformSettings.report();
+    meshSettings.report();
+    gradientSettings.report();
 
-  // set up mesh
-  mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm);
+    // set up mesh
+    mesh_t mesh(platform, meshSettings, comm);
 
-  // set up gradient solver
-  gradient_t& gradient = gradient_t::Setup(platform, mesh, gradientSettings);
+    // set up gradient solver
+    gradient_t gradient(platform, mesh, gradientSettings);
 
-  // run
-  gradient.Run();
+    // run
+    gradient.Run();
+  }
 
   // close down MPI
-  MPI_Finalize();
+  Comm::Finalize();
   return LIBP_SUCCESS;
 }
diff --git a/solvers/gradient/makefile b/solvers/gradient/makefile
index ae03306c6..ee3ba7d66 100644
--- a/solvers/gradient/makefile
+++ b/solvers/gradient/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -78,11 +78,8 @@ include ../../make.top
 endif
 endif
 
-#gslib
-GS_DIR=${LIBP_TPL_DIR}/gslib
-
 #libraries
-GRADIENT_LIBP_LIBS=mesh ogs linAlg core
+GRADIENT_LIBP_LIBS=mesh parAdogs ogs linAlg core
 
 #includes
 INCLUDES=${LIBP_INCLUDES} \
@@ -97,7 +94,6 @@ GRADIENT_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES}
 
 #link libraries
 LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(GRADIENT_LIBP_LIBS)) \
-     -L$(GS_DIR)/lib -lgs \
      ${LIBP_LIBS}
 
 #link flags
@@ -145,10 +141,10 @@ endif
 # rule for .cpp files
 %.o: %.cpp $(DEPS) | libp_libs
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $*.o -c $*.cpp $(GRADIENT_CXXFLAGS)
+	$(LIBP_CXX) -o $*.o -c $*.cpp $(GRADIENT_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $*.o -c $*.cpp $(GRADIENT_CXXFLAGS)
+	@$(LIBP_CXX) -o $*.o -c $*.cpp $(GRADIENT_CXXFLAGS)
 endif
 
 #cleanup
@@ -159,8 +155,7 @@ clean-libs: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} clean
 
 clean-kernels: clean-libs
-# 	$(shell ${OCCA_DIR}/bin/occa clear all -y)
-	rm -rf ~/.occa/
+	rm -rf ${LIBP_DIR}/.occa/
 
 realclean: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} realclean
diff --git a/solvers/gradient/okl/gradientInitialCondition2D.okl b/solvers/gradient/okl/gradientInitialCondition2D.okl
index de6cc2fd3..1086743dd 100644
--- a/solvers/gradient/okl/gradientInitialCondition2D.okl
+++ b/solvers/gradient/okl/gradientInitialCondition2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/gradient/okl/gradientInitialCondition3D.okl b/solvers/gradient/okl/gradientInitialCondition3D.okl
index 8f8a09829..69c33fc9f 100644
--- a/solvers/gradient/okl/gradientInitialCondition3D.okl
+++ b/solvers/gradient/okl/gradientInitialCondition3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/gradient/okl/gradientVolumeHex3D.okl b/solvers/gradient/okl/gradientVolumeHex3D.okl
index cc997bf4b..7ece12235 100644
--- a/solvers/gradient/okl/gradientVolumeHex3D.okl
+++ b/solvers/gradient/okl/gradientVolumeHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -36,29 +36,23 @@ for(dlong e=0;e<Nelements;++e;@outer(0)){
     @shared dfloat s_DT[p_Nq][p_Nq];
     @shared dfloat s_q[p_Nq][p_Nq][p_Nq];
 
-    // prefetch DT
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
+        // prefetch DT
         s_DT[j][i] = DT[i + p_Nq*j];
-      }
-    }
 
-    // prefetch q
-    for(int k=0;k<p_Nq;++k){
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
+        // prefetch q
+        for(int k=0;k<p_Nq;++k){
           const dlong id = e*p_Np+k*p_Nq*p_Nq+j*p_Nq+i;
           s_q[k][j][i] = q[id];
         }
       }
     }
 
-    @barrier("local");
-
     // loop over slabs
-    for(int k=0;k<p_Nq;++k){
-      for(int j=0;j<p_Nq;++j;@inner(1)){
-        for(int i=0;i<p_Nq;++i;@inner(0)){
+    for(int j=0;j<p_Nq;++j;@inner(1)){
+      for(int i=0;i<p_Nq;++i;@inner(0)){
+        for(int k=0;k<p_Nq;++k){
 
           const dlong gid = i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np*p_Nvgeo;
 
diff --git a/solvers/gradient/okl/gradientVolumeQuad2D.okl b/solvers/gradient/okl/gradientVolumeQuad2D.okl
index abd84160a..bfcd371b0 100644
--- a/solvers/gradient/okl/gradientVolumeQuad2D.okl
+++ b/solvers/gradient/okl/gradientVolumeQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -48,8 +48,6 @@ for(dlong e=0;e<Nelements;++e;@outer(0)){
       }
     }
 
-    @barrier("local");
-
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
 
diff --git a/solvers/gradient/okl/gradientVolumeTet3D.okl b/solvers/gradient/okl/gradientVolumeTet3D.okl
index deecff603..d88328718 100644
--- a/solvers/gradient/okl/gradientVolumeTet3D.okl
+++ b/solvers/gradient/okl/gradientVolumeTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,11 +29,11 @@ SOFTWARE.
 
 #if 0
 void gradientVolumeTet3D(int Nelements,
-			 dfloat *vgeo, // geometric factors
-			 dfloat *D, // D matrices
-			 dfloat *q,    // data at nodes
-			 dfloat *gradq // physical gradient
-			 ){
+                         dfloat *vgeo, // geometric factors
+                         dfloat *D, // D matrices
+                         dfloat *q,    // data at nodes
+                         dfloat *gradq // physical gradient
+                         ){
 
   // loop over all elements
   for(int e=0;e<Nelements;++e){
@@ -44,15 +44,15 @@ void gradientVolumeTet3D(int Nelements,
       dfloat qr = 0, qs = 0, qt = 0;
 
       for(int m=0;m<p_Np;++m){
-	dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
-	dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
+        dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+        dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+        dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
 
-	int id = e*p_Np + m;
+        int id = e*p_Np + m;
 
-	qr += dr*q[id];
-	qs += ds*q[id];
-	qt += dt*q[id];
+        qr += dr*q[id];
+        qs += ds*q[id];
+        qt += dt*q[id];
       }
 
       dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
@@ -84,11 +84,11 @@ void gradientVolumeTet3D(int Nelements,
 
 // annotate @kernels with "@kernel" keyword
 @kernel void gradientVolumeTet3D_v0(int Nelements,
-				   dfloat *vgeo, // geometric factors
-				   dfloat *D, // D matrices
-				   dfloat *q,    // data at nodes
-				   dfloat *gradq // physical gradient
-				   ){
+                                   dfloat *vgeo, // geometric factors
+                                   dfloat *D, // D matrices
+                                   dfloat *q,    // data at nodes
+                                   dfloat *gradq // physical gradient
+                                   ){
 
   // loop over all elements
   for(int e=0;e<Nelements;++e;@outer(0)){ // distributed amongst cores
@@ -99,15 +99,15 @@ void gradientVolumeTet3D(int Nelements,
       dfloat qr = 0, qs = 0, qt = 0;
 
       for(int m=0;m<p_Np;++m){
-	dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
-	dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
+        dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+        dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+        dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
 
-	int id = e*p_Np + m;
+        int id = e*p_Np + m;
 
-	qr += dr*q[id];
-	qs += ds*q[id];
-	qt += dt*q[id];
+        qr += dr*q[id];
+        qs += ds*q[id];
+        qt += dt*q[id];
       }
 
       dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
@@ -138,11 +138,11 @@ void gradientVolumeTet3D(int Nelements,
 
 // @kernel 1: declare po@restrict inters as  and const everything we can
 @kernel void gradientVolumeTet3D_v1(const int Nelements,
-				   @restrict const  dfloat *  vgeo, // geometric factors
-				   @restrict const  dfloat *  D, // D matrices
-				   @restrict const  dfloat *  q,    // data at nodes
-				   @restrict dfloat *  gradq // physical gradient
-				){
+                                   @restrict const  dfloat *  vgeo, // geometric factors
+                                   @restrict const  dfloat *  D, // D matrices
+                                   @restrict const  dfloat *  q,    // data at nodes
+                                   @restrict dfloat *  gradq // physical gradient
+                                ){
 
   // loop over all elements
   for(int e=0;e<Nelements;++e;@outer(0)){ // distributed amongst cores
@@ -153,15 +153,15 @@ void gradientVolumeTet3D(int Nelements,
       dfloat qr = 0, qs = 0, qt = 0;
 
       for(int m=0;m<p_Np;++m){
-	const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
-	const dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
+        const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+        const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+        const dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
 
-	const int id = e*p_Np + m;
+        const int id = e*p_Np + m;
 
-	qr += dr*q[id];
-	qs += ds*q[id];
-	qt += dt*q[id];
+        qr += dr*q[id];
+        qs += ds*q[id];
+        qt += dt*q[id];
       }
 
       const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
@@ -192,11 +192,11 @@ void gradientVolumeTet3D(int Nelements,
 
 // @kernel 2: unroll innermost loop
 @kernel void gradientVolumeTet3D_v2(const int Nelements,
-				@restrict const  dfloat *  vgeo, // geometric factors
-				@restrict const  dfloat *  D, // D matrices
-				@restrict const  dfloat *  q,    // data at nodes
-				@restrict dfloat *  gradq // physical gradient
-				){
+                                @restrict const  dfloat *  vgeo, // geometric factors
+                                @restrict const  dfloat *  D, // D matrices
+                                @restrict const  dfloat *  q,    // data at nodes
+                                @restrict dfloat *  gradq // physical gradient
+                                ){
 
   // loop over all elements
   for(int e=0;e<Nelements;++e;@outer(0)){ // distributed amongst cores
@@ -207,17 +207,17 @@ void gradientVolumeTet3D(int Nelements,
       dfloat qr = 0, qs = 0, qt = 0;
 
       #pragma unroll p_Np
-	for(int m=0;m<p_Np;++m){
-	  const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	  const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
-	  const dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
+        for(int m=0;m<p_Np;++m){
+          const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+          const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+          const dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
 
-	  const int id = e*p_Np + m;
+          const int id = e*p_Np + m;
 
-	  qr += dr*q[id];
-	  qs += ds*q[id];
-	  qt += dt*q[id];
-	}
+          qr += dr*q[id];
+          qs += ds*q[id];
+          qt += dt*q[id];
+        }
 
       const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
       const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
@@ -247,11 +247,11 @@ void gradientVolumeTet3D(int Nelements,
 
 // @kernel 3: @shared memory prefetch
 @kernel void gradientVolumeTet3D_v3(const int Nelements,
-				@restrict const  dfloat *  vgeo, // geometric factors
-				@restrict const  dfloat *  D, // D matrices
-				@restrict const  dfloat *  q,    // data at nodes
-				@restrict dfloat *  gradq // physical gradient
-				){
+                                @restrict const  dfloat *  vgeo, // geometric factors
+                                @restrict const  dfloat *  D, // D matrices
+                                @restrict const  dfloat *  q,    // data at nodes
+                                @restrict dfloat *  gradq // physical gradient
+                                ){
 
   // loop over all elements
   for(int e=0;e<Nelements;++e;@outer(0)){ // distributed amongst cores
@@ -265,24 +265,21 @@ void gradientVolumeTet3D(int Nelements,
       s_q[n] = q[id];
     }
 
-    // make sure all values are prefetched
-    @barrier("local");
-
     // loop over all nodes in element e
     for(int n=0;n<p_Np;++n;@inner(0)){ // distributed to thread
 
       dfloat qr = 0, qs = 0, qt = 0;
 
       #pragma unroll p_Np
-	for(int m=0;m<p_Np;++m){
-	  const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	  const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
-	  const dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
-	  const dfloat qm = s_q[m];
-	  qr += dr*qm;
-	  qs += ds*qm;
-	  qt += dt*qm;
-	}
+        for(int m=0;m<p_Np;++m){
+          const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+          const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+          const dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
+          const dfloat qm = s_q[m];
+          qr += dr*qm;
+          qs += ds*qm;
+          qt += dt*qm;
+        }
 
       const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
       const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
@@ -312,11 +309,11 @@ void gradientVolumeTet3D(int Nelements,
 
 // @kernel 4: multiple nodes per thread
 @kernel void gradientVolumeTet3D_v4(const int Nelements,
-				@restrict const  dfloat *  vgeo, // geometric factors
-				@restrict const  dfloat *  D, // D matrices
-				@restrict const  dfloat *  q,    // data at nodes
-				@restrict dfloat *  gradq // physical gradient
-				){
+                                @restrict const  dfloat *  vgeo, // geometric factors
+                                @restrict const  dfloat *  D, // D matrices
+                                @restrict const  dfloat *  q,    // data at nodes
+                                @restrict dfloat *  gradq // physical gradient
+                                ){
 
 #define p_Nblock 4
 
@@ -329,86 +326,83 @@ void gradientVolumeTet3D(int Nelements,
 
       // prefetch to @shared
       #pragma unroll p_Nblock
-	for(int es=0;es<p_Nblock;++es){
-	  const int e = eo + es;
-	  if(e<Nelements){
-	    const int id = e*p_Np + n;
-	    s_q[es][n] = q[id];
-	  }
-	}
+        for(int es=0;es<p_Nblock;++es){
+          const int e = eo + es;
+          if(e<Nelements){
+            const int id = e*p_Np + n;
+            s_q[es][n] = q[id];
+          }
+        }
     }
 
-    // make sure all values are prefetched
-    @barrier("local");
-
     // loop over all nodes in element e
     for(int n=0;n<p_Np;++n;@inner(0)){ // distributed to thread
 
       dfloat qr[p_Nblock], qs[p_Nblock], qt[p_Nblock];
 
       #pragma unroll p_Nblock
-	for(int es=0;es<p_Nblock;++es){
-	  qr[es] = 0;
-	  qs[es] = 0;
-	  qt[es] = 0;
-	}
+        for(int es=0;es<p_Nblock;++es){
+          qr[es] = 0;
+          qs[es] = 0;
+          qt[es] = 0;
+        }
 
       #pragma unroll p_Np
-	for(int m=0;m<p_Np;++m){
-	  // Vasily Volkov "multiple outputs" paper
-	  const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	  const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
-	  const dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
-
-	  #pragma unroll p_Nblock
-	    for(int es=0;es<p_Nblock;++es){
-
-	      const dfloat qm = s_q[es][m];
-	      qr[es] += dr*qm;
-	      qs[es] += ds*qm;
-	      qt[es] += dt*qm;
-	    }
-	}
+        for(int m=0;m<p_Np;++m){
+          // Vasily Volkov "multiple outputs" paper
+          const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+          const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+          const dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
+
+          #pragma unroll p_Nblock
+            for(int es=0;es<p_Nblock;++es){
+
+              const dfloat qm = s_q[es][m];
+              qr[es] += dr*qm;
+              qs[es] += ds*qm;
+              qt[es] += dt*qm;
+            }
+        }
 
       #pragma unroll p_Nblock
-	for(int es=0;es<p_Nblock;++es){
-	  const int e = eo + es;
-	  if(e<Nelements){
+        for(int es=0;es<p_Nblock;++es){
+          const int e = eo + es;
+          if(e<Nelements){
 
-	    const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
-	    const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
-	    const dfloat tx = vgeo[e*p_Nvgeo + p_TXID];
+            const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
+            const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
+            const dfloat tx = vgeo[e*p_Nvgeo + p_TXID];
 
-	    const dfloat ry = vgeo[e*p_Nvgeo + p_RYID];
-	    const dfloat sy = vgeo[e*p_Nvgeo + p_SYID];
-	    const dfloat ty = vgeo[e*p_Nvgeo + p_TYID];
+            const dfloat ry = vgeo[e*p_Nvgeo + p_RYID];
+            const dfloat sy = vgeo[e*p_Nvgeo + p_SYID];
+            const dfloat ty = vgeo[e*p_Nvgeo + p_TYID];
 
-	    const dfloat rz = vgeo[e*p_Nvgeo + p_RZID];
-	    const dfloat sz = vgeo[e*p_Nvgeo + p_SZID];
-	    const dfloat tz = vgeo[e*p_Nvgeo + p_TZID];
+            const dfloat rz = vgeo[e*p_Nvgeo + p_RZID];
+            const dfloat sz = vgeo[e*p_Nvgeo + p_SZID];
+            const dfloat tz = vgeo[e*p_Nvgeo + p_TZID];
 
-	    const dfloat qx = rx*qr[es] + sx*qs[es] + tx*qt[es];
-	    const dfloat qy = ry*qr[es] + sy*qs[es] + ty*qt[es];
-	    const dfloat qz = rz*qr[es] + sz*qs[es] + tz*qt[es];
+            const dfloat qx = rx*qr[es] + sx*qs[es] + tx*qt[es];
+            const dfloat qy = ry*qr[es] + sy*qs[es] + ty*qt[es];
+            const dfloat qz = rz*qr[es] + sz*qs[es] + tz*qt[es];
 
-	    const int id = e*p_Np*3 + n;
+            const int id = e*p_Np*3 + n;
 
-	    gradq[id + 0*p_Np] = qx;
-	    gradq[id + 1*p_Np] = qy;
-	    gradq[id + 2*p_Np] = qz;
-	  }
-	}
+            gradq[id + 0*p_Np] = qx;
+            gradq[id + 1*p_Np] = qy;
+            gradq[id + 2*p_Np] = qz;
+          }
+        }
     }
   }
 }
 
 // @kernel 5: simd cramming
 @kernel void gradientVolumeTet3D(const int Nelements,
-				@restrict const  dfloat *  vgeo, // geometric factors
-				@restrict const  dfloat *  D, // D matrices
-				@restrict const  dfloat *  q,    // data at nodes
-				@restrict dfloat *  gradq // physical gradient
-				){
+                                @restrict const  dfloat *  vgeo, // geometric factors
+                                @restrict const  dfloat *  D, // D matrices
+                                @restrict const  dfloat *  q,    // data at nodes
+                                @restrict dfloat *  gradq // physical gradient
+                                ){
 
 #define p_Nvec 1
 #define p_Nblock 2
@@ -421,79 +415,76 @@ void gradientVolumeTet3D(int Nelements,
     for(int et=0;et<p_Nvec;++et;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){ // distributed to thread
 
-	// prefetch to @shared
-	#pragma unroll p_Nblock
-	  for(int es=0;es<p_Nblock;++es){
-	    const int e = eo + et + p_Nvec*es;
-	    if(e<Nelements){
-	      const int id = e*p_Np + n;
-	      s_q[es][et][n] = q[id];
-	    }
-	  }
+        // prefetch to @shared
+        #pragma unroll p_Nblock
+          for(int es=0;es<p_Nblock;++es){
+            const int e = eo + et + p_Nvec*es;
+            if(e<Nelements){
+              const int id = e*p_Np + n;
+              s_q[es][et][n] = q[id];
+            }
+          }
       }
     }
 
-    // make sure all values are prefetched
-    @barrier("local");
-
     // loop over all nodes in element e
     for(int et=0;et<p_Nvec;++et;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){ // distributed to thread
 
-	dfloat qr[p_Nblock], qs[p_Nblock], qt[p_Nblock];
-
-	#pragma unroll p_Nblock
-	  for(int es=0;es<p_Nblock;++es){
-	    qr[es] = 0;
-	    qs[es] = 0;
-	    qt[es] = 0;
-	  }
-
-	#pragma unroll p_Np
-	  for(int m=0;m<p_Np;++m){
-	    // Vasily Volkov "multiple outputs" paper
-	    const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	    const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
-	    const dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
-
-	    #pragma unroll p_Nblock
-	      for(int es=0;es<p_Nblock;++es){
-
-		const dfloat qm = s_q[es][et][m];
-		qr[es] += dr*qm;
-		qs[es] += ds*qm;
-		qt[es] += dt*qm;
-	      }
-	  }
-
-	#pragma unroll p_Nblock
-	  for(int es=0;es<p_Nblock;++es){
-	    const int e = eo + et + p_Nvec*es;
-	    if(e<Nelements){
-
-	      const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
-	      const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
-	      const dfloat tx = vgeo[e*p_Nvgeo + p_TXID];
-
-	      const dfloat ry = vgeo[e*p_Nvgeo + p_RYID];
-	      const dfloat sy = vgeo[e*p_Nvgeo + p_SYID];
-	      const dfloat ty = vgeo[e*p_Nvgeo + p_TYID];
-
-	      const dfloat rz = vgeo[e*p_Nvgeo + p_RZID];
-	      const dfloat sz = vgeo[e*p_Nvgeo + p_SZID];
-	      const dfloat tz = vgeo[e*p_Nvgeo + p_TZID];
-
-	      const dfloat qx = rx*qr[es] + sx*qs[es] + tx*qt[es];
-	      const dfloat qy = ry*qr[es] + sy*qs[es] + ty*qt[es];
-	      const dfloat qz = rz*qr[es] + sz*qs[es] + tz*qt[es];
-
-	      const int id = e*p_Np*3 + n;
-
-	      gradq[id + 0*p_Np] = qx;
-	      gradq[id + 1*p_Np] = qy;
-	      gradq[id + 2*p_Np] = qz;
-	    }
-	}
+        dfloat qr[p_Nblock], qs[p_Nblock], qt[p_Nblock];
+
+        #pragma unroll p_Nblock
+          for(int es=0;es<p_Nblock;++es){
+            qr[es] = 0;
+            qs[es] = 0;
+            qt[es] = 0;
+          }
+
+        #pragma unroll p_Np
+          for(int m=0;m<p_Np;++m){
+            // Vasily Volkov "multiple outputs" paper
+            const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+            const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+            const dfloat dt = D[n + m*p_Np + 2*p_Np*p_Np];
+
+            #pragma unroll p_Nblock
+              for(int es=0;es<p_Nblock;++es){
+
+                const dfloat qm = s_q[es][et][m];
+                qr[es] += dr*qm;
+                qs[es] += ds*qm;
+                qt[es] += dt*qm;
+              }
+          }
+
+        #pragma unroll p_Nblock
+          for(int es=0;es<p_Nblock;++es){
+            const int e = eo + et + p_Nvec*es;
+            if(e<Nelements){
+
+              const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
+              const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
+              const dfloat tx = vgeo[e*p_Nvgeo + p_TXID];
+
+              const dfloat ry = vgeo[e*p_Nvgeo + p_RYID];
+              const dfloat sy = vgeo[e*p_Nvgeo + p_SYID];
+              const dfloat ty = vgeo[e*p_Nvgeo + p_TYID];
+
+              const dfloat rz = vgeo[e*p_Nvgeo + p_RZID];
+              const dfloat sz = vgeo[e*p_Nvgeo + p_SZID];
+              const dfloat tz = vgeo[e*p_Nvgeo + p_TZID];
+
+              const dfloat qx = rx*qr[es] + sx*qs[es] + tx*qt[es];
+              const dfloat qy = ry*qr[es] + sy*qs[es] + ty*qt[es];
+              const dfloat qz = rz*qr[es] + sz*qs[es] + tz*qt[es];
+
+              const int id = e*p_Np*3 + n;
+
+              gradq[id + 0*p_Np] = qx;
+              gradq[id + 1*p_Np] = qy;
+              gradq[id + 2*p_Np] = qz;
+            }
+        }
       }
     }
   }
diff --git a/solvers/gradient/okl/gradientVolumeTri2D.okl b/solvers/gradient/okl/gradientVolumeTri2D.okl
index 755322463..1ccc757e3 100644
--- a/solvers/gradient/okl/gradientVolumeTri2D.okl
+++ b/solvers/gradient/okl/gradientVolumeTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,11 +29,11 @@ SOFTWARE.
 
 #if 0
 void gradientVolumeTri2D(int Nelements,
-			 dfloat *vgeo, // geometric factors
-			 dfloat *D, // D matrices
-			 dfloat *q,    // data at nodes
-			 dfloat *gradq // physical gradient
-			 ){
+                         dfloat *vgeo, // geometric factors
+                         dfloat *D, // D matrices
+                         dfloat *q,    // data at nodes
+                         dfloat *gradq // physical gradient
+                         ){
 
   // loop over all elements
   for(int e=0;e<Nelements;++e){
@@ -44,13 +44,13 @@ void gradientVolumeTri2D(int Nelements,
       dfloat qr = 0, qs = 0;
 
       for(int m=0;m<p_Np;++m){
-	dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+        dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+        dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
 
-	int id = e*p_Np + m;
+        int id = e*p_Np + m;
 
-	qr += dr*q[id];
-	qs += ds*q[id];
+        qr += dr*q[id];
+        qs += ds*q[id];
       }
 
       dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
@@ -74,11 +74,11 @@ void gradientVolumeTri2D(int Nelements,
 
 // annotate @kernels with "@kernel" keyword
 @kernel void gradientVolumeTri2D_v0(int Nelements,
-				   dfloat *vgeo, // geometric factors
-				   dfloat *D, // D matrices
-				   dfloat *q,    // data at nodes
-				   dfloat *gradq // physical gradient
-				   ){
+                                   dfloat *vgeo, // geometric factors
+                                   dfloat *D, // D matrices
+                                   dfloat *q,    // data at nodes
+                                   dfloat *gradq // physical gradient
+                                   ){
 
   // loop over all elements
   for(int e=0;e<Nelements;++e;@outer(0)){ // distributed amongst cores
@@ -89,13 +89,13 @@ void gradientVolumeTri2D(int Nelements,
       dfloat qr = 0, qs = 0;
 
       for(int m=0;m<p_Np;++m){
-	dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+        dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+        dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
 
-	int id = e*p_Np + m;
+        int id = e*p_Np + m;
 
-	qr += dr*q[id];
-	qs += ds*q[id];
+        qr += dr*q[id];
+        qs += ds*q[id];
       }
 
       dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
@@ -118,11 +118,11 @@ void gradientVolumeTri2D(int Nelements,
 
 // @kernel 1: declare po@restrict inters as  and const everything we can
 @kernel void gradientVolumeTri2D_v1(const int Nelements,
-				   @restrict const  dfloat *  vgeo, // geometric factors
-				   @restrict const  dfloat *  D, // D matrices
-				   @restrict const  dfloat *  q,    // data at nodes
-				   @restrict dfloat *  gradq // physical gradient
-				   ){
+                                   @restrict const  dfloat *  vgeo, // geometric factors
+                                   @restrict const  dfloat *  D, // D matrices
+                                   @restrict const  dfloat *  q,    // data at nodes
+                                   @restrict dfloat *  gradq // physical gradient
+                                   ){
 
   // loop over all elements
   for(int e=0;e<Nelements;++e;@outer(0)){ // distributed amongst cores
@@ -133,13 +133,13 @@ void gradientVolumeTri2D(int Nelements,
       dfloat qr = 0, qs = 0;
 
       for(int m=0;m<p_Np;++m){
-	const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+        const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+        const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
 
-	const int id = e*p_Np + m;
+        const int id = e*p_Np + m;
 
-	qr += dr*q[id];
-	qs += ds*q[id];
+        qr += dr*q[id];
+        qs += ds*q[id];
       }
 
       const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
@@ -161,11 +161,11 @@ void gradientVolumeTri2D(int Nelements,
 
 // @kernel 2: unroll innermost loop
 @kernel void gradientVolumeTri2D_v2(const int Nelements,
-				   @restrict const  dfloat *  vgeo, // geometric factors
-				   @restrict const  dfloat *  D, // D matrices
-				   @restrict const  dfloat *  q,    // data at nodes
-				   @restrict dfloat *  gradq // physical gradient
-				   ){
+                                   @restrict const  dfloat *  vgeo, // geometric factors
+                                   @restrict const  dfloat *  D, // D matrices
+                                   @restrict const  dfloat *  q,    // data at nodes
+                                   @restrict dfloat *  gradq // physical gradient
+                                   ){
 
   // loop over all elements
   for(int e=0;e<Nelements;++e;@outer(0)){ // distributed amongst cores
@@ -176,15 +176,15 @@ void gradientVolumeTri2D(int Nelements,
       dfloat qr = 0, qs = 0;
 
       #pragma unroll p_Np
-	for(int m=0;m<p_Np;++m){
-	  const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	  const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+        for(int m=0;m<p_Np;++m){
+          const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+          const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
 
-	  const int id = e*p_Np + m;
+          const int id = e*p_Np + m;
 
-	  qr += dr*q[id];
-	  qs += ds*q[id];
-	}
+          qr += dr*q[id];
+          qs += ds*q[id];
+        }
 
       const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
       const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
@@ -205,11 +205,11 @@ void gradientVolumeTri2D(int Nelements,
 
 // @kernel 3: @shared memory prefetch
 @kernel void gradientVolumeTri2D_v3(const int Nelements,
-				   @restrict const  dfloat *  vgeo, // geometric factors
-				   @restrict const  dfloat *  D, // D matrices
-				   @restrict const  dfloat *  q,    // data at nodes
-				   @restrict dfloat *  gradq // physical gradient
-				   ){
+                                   @restrict const  dfloat *  vgeo, // geometric factors
+                                   @restrict const  dfloat *  D, // D matrices
+                                   @restrict const  dfloat *  q,    // data at nodes
+                                   @restrict dfloat *  gradq // physical gradient
+                                   ){
 
   // loop over all elements
   for(int e=0;e<Nelements;++e;@outer(0)){ // distributed amongst cores
@@ -223,23 +223,20 @@ void gradientVolumeTri2D(int Nelements,
       s_q[n] = q[id];
     }
 
-    // make sure all values are prefetched
-    @barrier("local");
-
     // loop over all nodes in element e
     for(int n=0;n<p_Np;++n;@inner(0)){ // distributed to thread
 
       dfloat qr = 0, qs = 0;
 
       #pragma unroll p_Np
-	for(int m=0;m<p_Np;++m){
-	  const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	  const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+        for(int m=0;m<p_Np;++m){
+          const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+          const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
 
-	  const dfloat qm = s_q[m];
-	  qr += dr*qm;
-	  qs += ds*qm;
-	}
+          const dfloat qm = s_q[m];
+          qr += dr*qm;
+          qs += ds*qm;
+        }
 
       const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
       const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
@@ -261,11 +258,11 @@ void gradientVolumeTri2D(int Nelements,
 
 // @kernel 4: multiple nodes per thread
 @kernel void gradientVolumeTri2D_v4(const int Nelements,
-				   @restrict const  dfloat *  vgeo, // geometric factors
-				   @restrict const  dfloat *  D, // D matrices
-				   @restrict const  dfloat *  q,    // data at nodes
-				   @restrict dfloat *  gradq // physical gradient
-				   ){
+                                   @restrict const  dfloat *  vgeo, // geometric factors
+                                   @restrict const  dfloat *  D, // D matrices
+                                   @restrict const  dfloat *  q,    // data at nodes
+                                   @restrict dfloat *  gradq // physical gradient
+                                   ){
 
 #define p_Nblock 4
 
@@ -278,75 +275,72 @@ void gradientVolumeTri2D(int Nelements,
 
       // prefetch to @shared
       #pragma unroll p_Nblock
-	for(int es=0;es<p_Nblock;++es){
-	  const int e = eo + es;
-	  if(e<Nelements){
-	    const int id = e*p_Np + n;
-	    s_q[es][n] = q[id];
-	  }
-	}
+        for(int es=0;es<p_Nblock;++es){
+          const int e = eo + es;
+          if(e<Nelements){
+            const int id = e*p_Np + n;
+            s_q[es][n] = q[id];
+          }
+        }
     }
 
-    // make sure all values are prefetched
-    @barrier("local");
-
     // loop over all nodes in element e
     for(int n=0;n<p_Np;++n;@inner(0)){ // distributed to thread
 
       dfloat qr[p_Nblock], qs[p_Nblock];
 
       #pragma unroll p_Nblock
-	for(int es=0;es<p_Nblock;++es){
-	  qr[es] = 0;
-	  qs[es] = 0;
-	}
+        for(int es=0;es<p_Nblock;++es){
+          qr[es] = 0;
+          qs[es] = 0;
+        }
 
       #pragma unroll p_Np
-	for(int m=0;m<p_Np;++m){
-	  // Vasily Volkov "multiple outputs" paper
-	  const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	  const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+        for(int m=0;m<p_Np;++m){
+          // Vasily Volkov "multiple outputs" paper
+          const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+          const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
 
-	  #pragma unroll p_Nblock
-	    for(int es=0;es<p_Nblock;++es){
+          #pragma unroll p_Nblock
+            for(int es=0;es<p_Nblock;++es){
 
-	      const dfloat qm = s_q[es][m];
-	      qr[es] += dr*qm;
-	      qs[es] += ds*qm;
-	    }
-	}
+              const dfloat qm = s_q[es][m];
+              qr[es] += dr*qm;
+              qs[es] += ds*qm;
+            }
+        }
 
       #pragma unroll p_Nblock
-	for(int es=0;es<p_Nblock;++es){
-	  const int e = eo + es;
-	  if(e<Nelements){
+        for(int es=0;es<p_Nblock;++es){
+          const int e = eo + es;
+          if(e<Nelements){
 
-	    const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
-	    const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
+            const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
+            const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
 
-	    const dfloat ry = vgeo[e*p_Nvgeo + p_RYID];
-	    const dfloat sy = vgeo[e*p_Nvgeo + p_SYID];
+            const dfloat ry = vgeo[e*p_Nvgeo + p_RYID];
+            const dfloat sy = vgeo[e*p_Nvgeo + p_SYID];
 
-	    const dfloat qx = rx*qr[es] + sx*qs[es];
-	    const dfloat qy = ry*qr[es] + sy*qs[es];
+            const dfloat qx = rx*qr[es] + sx*qs[es];
+            const dfloat qy = ry*qr[es] + sy*qs[es];
 
-	    const int id = e*p_Np*p_dim + n;
+            const int id = e*p_Np*p_dim + n;
 
-	    gradq[id + 0*p_Np] = qx;
-	    gradq[id + 1*p_Np] = qy;
-	  }
-	}
+            gradq[id + 0*p_Np] = qx;
+            gradq[id + 1*p_Np] = qy;
+          }
+        }
     }
   }
 }
 
 // @kernel 5: simd cramming
 @kernel void gradientVolumeTri2D(const int Nelements,
-				@restrict const  dfloat *  vgeo, // geometric factors
-				@restrict const  dfloat *  D, // D matrices
-				@restrict const  dfloat *  q,    // data at nodes
-				@restrict dfloat *  gradq // physical gradient
-				){
+                                @restrict const  dfloat *  vgeo, // geometric factors
+                                @restrict const  dfloat *  D, // D matrices
+                                @restrict const  dfloat *  q,    // data at nodes
+                                @restrict dfloat *  gradq // physical gradient
+                                ){
 
 #define p_Nvec 1
 #define p_Nblock 5
@@ -359,68 +353,65 @@ void gradientVolumeTri2D(int Nelements,
     for(int et=0;et<p_Nvec;++et;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){ // distributed to thread
 
-	// prefetch to @shared
-	#pragma unroll p_Nblock
-	  for(int es=0;es<p_Nblock;++es){
-	    const int e = eo + et + p_Nvec*es;
-	    if(e<Nelements){
-	      const int id = e*p_Np + n;
-	      s_q[es][et][n] = q[id];
-	    }
-	  }
+        // prefetch to @shared
+        #pragma unroll p_Nblock
+          for(int es=0;es<p_Nblock;++es){
+            const int e = eo + et + p_Nvec*es;
+            if(e<Nelements){
+              const int id = e*p_Np + n;
+              s_q[es][et][n] = q[id];
+            }
+          }
       }
     }
 
-    // make sure all values are prefetched
-    @barrier("local");
-
     // loop over all nodes in element e
     for(int et=0;et<p_Nvec;++et;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){ // distributed to thread
 
-	dfloat qr[p_Nblock], qs[p_Nblock];
+        dfloat qr[p_Nblock], qs[p_Nblock];
 
-	#pragma unroll p_Nblock
-	  for(int es=0;es<p_Nblock;++es){
-	    qr[es] = 0;
-	    qs[es] = 0;
-	  }
+        #pragma unroll p_Nblock
+          for(int es=0;es<p_Nblock;++es){
+            qr[es] = 0;
+            qs[es] = 0;
+          }
 
-	#pragma unroll p_Np
-	  for(int m=0;m<p_Np;++m){
-	    // Vasily Volkov "multiple outputs" paper
-	    const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
-	    const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
+        #pragma unroll p_Np
+          for(int m=0;m<p_Np;++m){
+            // Vasily Volkov "multiple outputs" paper
+            const dfloat dr = D[n + m*p_Np + 0*p_Np*p_Np];
+            const dfloat ds = D[n + m*p_Np + 1*p_Np*p_Np];
 
-	    #pragma unroll p_Nblock
-	      for(int es=0;es<p_Nblock;++es){
+            #pragma unroll p_Nblock
+              for(int es=0;es<p_Nblock;++es){
 
-		const dfloat qm = s_q[es][et][m];
-		qr[es] += dr*qm;
-		qs[es] += ds*qm;
-	      }
-	  }
+                const dfloat qm = s_q[es][et][m];
+                qr[es] += dr*qm;
+                qs[es] += ds*qm;
+              }
+          }
 
-	#pragma unroll p_Nblock
-	  for(int es=0;es<p_Nblock;++es){
-	    const int e = eo + et + p_Nvec*es;
-	    if(e<Nelements){
+        #pragma unroll p_Nblock
+          for(int es=0;es<p_Nblock;++es){
+            const int e = eo + et + p_Nvec*es;
+            if(e<Nelements){
 
-	      const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
-	      const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
+              const dfloat rx = vgeo[e*p_Nvgeo + p_RXID];
+              const dfloat sx = vgeo[e*p_Nvgeo + p_SXID];
 
-	      const dfloat ry = vgeo[e*p_Nvgeo + p_RYID];
-	      const dfloat sy = vgeo[e*p_Nvgeo + p_SYID];
+              const dfloat ry = vgeo[e*p_Nvgeo + p_RYID];
+              const dfloat sy = vgeo[e*p_Nvgeo + p_SYID];
 
-	      const dfloat qx = rx*qr[es] + sx*qs[es];
-	      const dfloat qy = ry*qr[es] + sy*qs[es];
+              const dfloat qx = rx*qr[es] + sx*qs[es];
+              const dfloat qy = ry*qr[es] + sy*qs[es];
 
-	      const int id = e*p_Np*p_dim + n;
+              const int id = e*p_Np*p_dim + n;
 
-	      gradq[id + 0*p_Np] = qx;
-	      gradq[id + 1*p_Np] = qy;
-	    }
-	  }
+              gradq[id + 0*p_Np] = qx;
+              gradq[id + 1*p_Np] = qy;
+            }
+          }
       }
     }
   }
diff --git a/solvers/gradient/src/gradientPlotFields.cpp b/solvers/gradient/src/gradientPlotFields.cpp
index c7d3b944b..41badd9a4 100644
--- a/solvers/gradient/src/gradientPlotFields.cpp
+++ b/solvers/gradient/src/gradientPlotFields.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -31,7 +31,7 @@ void gradient_t::PlotFields(){
 
   FILE *fp;
 
-  string fname = "gradient.vtu";
+  std::string fname = "gradient.vtu";
   const char* fileName = fname.c_str();
   fp = fopen(fileName, "w");
 
@@ -46,33 +46,39 @@ void gradient_t::PlotFields(){
   fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
 
   //scratch space for interpolation
-  size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat);
-  dfloat* scratch = (dfloat *) malloc(2*NscratchBytes);
+  size_t Nscratch = std::max(mesh.Np, mesh.plotNp);
+  memory<dfloat> scratch(2*Nscratch);
 
-  dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ix(mesh.plotNp);
+  memory<dfloat> Iy(mesh.plotNp);
+  memory<dfloat> Iz(mesh.plotNp);
 
   // compute plot node coordinates on the fly
   for(dlong e=0;e<mesh.Nelements;++e){
     mesh.PlotInterp(mesh.x + e*mesh.Np, Ix, scratch);
     mesh.PlotInterp(mesh.y + e*mesh.Np, Iy, scratch);
-    mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
+    if(mesh.dim==3) // mesh.z is only interpolated for 3D meshes; Iz is left untouched in 2D
+      mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
 
-    for(int n=0;n<mesh.plotNp;++n){
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+    if (mesh.dim==2) { // 2D: pad z with 0.0 so each point still has the 3 components declared in the DataArray header
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],0.0);
+      }
+    } else { // otherwise write the interpolated z-coordinate held in Iz
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+      }
     }
   }
   fprintf(fp, "        </DataArray>\n");
   fprintf(fp, "      </Points>\n");
 
-  free(Ix); free(Iy); free(Iz);
-
-  dfloat* Iq = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Iq(mesh.plotNp);
+  memory<dfloat> Iu(mesh.plotNp);
+  memory<dfloat> Iv(mesh.plotNp);
+  memory<dfloat> Iw(mesh.plotNp);
 
   // write out q
   fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
@@ -107,8 +113,6 @@ void gradient_t::PlotFields(){
   fprintf(fp, "       </DataArray>\n");
   fprintf(fp, "     </PointData>\n");
 
-  free(Iq); free(Iu); free(Iv); free(Iw);
-
   fprintf(fp, "    <Cells>\n");
   fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
 
@@ -149,6 +153,4 @@ void gradient_t::PlotFields(){
   fprintf(fp, "  </UnstructuredGrid>\n");
   fprintf(fp, "</VTKFile>\n");
   fclose(fp);
-
-  free(scratch);
 }
diff --git a/solvers/gradient/src/gradientReport.cpp b/solvers/gradient/src/gradientReport.cpp
index 1d00441d0..220ff2a31 100644
--- a/solvers/gradient/src/gradientReport.cpp
+++ b/solvers/gradient/src/gradientReport.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -32,7 +32,7 @@ void gradient_t::Report(){
   mesh.MassMatrixApply(o_gradq, o_Mgradq);
 
   dlong Nentries = mesh.Nelements*mesh.Np*Nfields;
-  dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_gradq, o_Mgradq, mesh.comm));
+  dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_gradq, o_Mgradq, mesh.comm));
 
   if(mesh.rank==0)
     printf("%5.2f (norm)\n", norm2);
diff --git a/solvers/gradient/src/gradientRun.cpp b/solvers/gradient/src/gradientRun.cpp
index a9121dee9..0f05a878f 100644
--- a/solvers/gradient/src/gradientRun.cpp
+++ b/solvers/gradient/src/gradientRun.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -48,7 +48,7 @@ void gradient_t::Run(){
     mesh.MassMatrixApply(o_gradq, o_Mgradq);
 
     dlong Nentries = mesh.Nelements*mesh.Np*Nfields;
-    dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_gradq, o_Mgradq, mesh.comm));
+    dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_gradq, o_Mgradq, mesh.comm));
 
     if(mesh.rank==0)
       printf("Solution norm = %17.15lg\n", norm2);
diff --git a/solvers/gradient/src/gradientSettings.cpp b/solvers/gradient/src/gradientSettings.cpp
index 0a2da238d..a37b7fe7c 100644
--- a/solvers/gradient/src/gradientSettings.cpp
+++ b/solvers/gradient/src/gradientSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ SOFTWARE.
 #include "gradient.hpp"
 
 //settings for gradient solver
-gradientSettings_t::gradientSettings_t(MPI_Comm& _comm):
+gradientSettings_t::gradientSettings_t(comm_t _comm):
   settings_t(_comm) {
 
   newSetting("DATA FILE",
@@ -42,10 +42,7 @@ gradientSettings_t::gradientSettings_t(MPI_Comm& _comm):
 
 void gradientSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) {
     std::cout << "Gradient Settings:\n\n";
     reportSetting("DATA FILE");
     reportSetting("OUTPUT TO FILE");
@@ -53,16 +50,16 @@ void gradientSettings_t::report() {
 }
 
 void gradientSettings_t::parseFromFile(platformSettings_t& platformSettings,
-                                  meshSettings_t& meshSettings,
-                                  const string filename) {
+                                       meshSettings_t& meshSettings,
+                                       const std::string filename) {
   //read all settings from file
   settings_t s(comm);
   s.readSettingsFromFile(filename);
 
   for(auto it = s.settings.begin(); it != s.settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
-    const string val = set->getVal<string>();
+    setting_t& set = it->second; // bind by reference: the mapped value is used directly as a setting_t
+    const std::string name = set.getName();
+    const std::string val = set.getVal<std::string>(); // read as a string so it can be passed on to changeSetting() as-is
     if (platformSettings.hasSetting(name))
       platformSettings.changeSetting(name, val);
     else if (meshSettings.hasSetting(name))
@@ -70,9 +67,7 @@ void gradientSettings_t::parseFromFile(platformSettings_t& platformSettings,
     else if (hasSetting(name)) //self
       changeSetting(name, val);
     else  {
-      stringstream ss;
-      ss << "Unknown setting: [" << name << "] requested";
-      LIBP_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested");
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/gradient/src/gradientSetup.cpp b/solvers/gradient/src/gradientSetup.cpp
index 0b334ff7f..0b5efb53e 100644
--- a/solvers/gradient/src/gradientSetup.cpp
+++ b/solvers/gradient/src/gradientSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,76 +26,74 @@ SOFTWARE.
 
 #include "gradient.hpp"
 
-gradient_t& gradient_t::Setup(platform_t& platform, mesh_t& mesh,
-                              gradientSettings_t& settings){
+void gradient_t::Setup(platform_t& _platform, mesh_t& _mesh,
+                       gradientSettings_t& _settings){
 
-  gradient_t* gradient = new gradient_t(platform, mesh, settings);
+  platform = _platform;
+  mesh = _mesh;
+  comm = mesh.comm;
+  settings = _settings;
 
-  gradient->Nfields = mesh.dim;
+  Nfields = mesh.dim;
 
   dlong Nlocal = mesh.Nelements*mesh.Np;
 
   //setup linear algebra module
-  platform.linAlg.InitKernels({"innerProd"});
+  platform.linAlg().InitKernels({"innerProd"});
 
   // compute samples of q at interpolation nodes
-  gradient->q = (dfloat*) calloc(Nlocal, sizeof(dfloat));
-  gradient->o_q = platform.malloc(Nlocal*sizeof(dfloat), gradient->q);
+  q.malloc(Nlocal);
+  o_q = platform.malloc<dfloat>(q);
 
-  gradient->gradq = (dfloat*) calloc(Nlocal*mesh.dim, sizeof(dfloat));
-  gradient->o_gradq = platform.malloc(Nlocal*mesh.dim*sizeof(dfloat), gradient->gradq);
+  gradq.malloc(Nlocal*mesh.dim);
+  o_gradq = platform.malloc<dfloat>(gradq);
 
   //storage for M*gradq during reporting
-  gradient->o_Mgradq = platform.malloc(Nlocal*mesh.dim*sizeof(dfloat), gradient->gradq);
-  mesh.MassMatrixKernelSetup(gradient->Nfields); // mass matrix operator
+  o_Mgradq = platform.malloc<dfloat>(gradq);
+  mesh.MassMatrixKernelSetup(Nfields); // mass matrix operator
 
   // OCCA build stuff
-  occa::properties kernelInfo = mesh.props; //copy base occa properties
+  properties_t kernelInfo = mesh.props; //copy base occa properties
 
   //add boundary data to kernel info
-  string dataFileName;
+  std::string dataFileName;
   settings.getSetting("DATA FILE", dataFileName);
   kernelInfo["includes"] += dataFileName;
 
-  kernelInfo["defines/" "p_Nfields"]= gradient->Nfields;
-
-  kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
+  kernelInfo["defines/" "p_Nfields"]= Nfields;
 
   // set kernel name suffix
-  char *suffix;
-  if(mesh.elementType==TRIANGLES)
-    suffix = strdup("Tri2D");
-  if(mesh.elementType==QUADRILATERALS)
-    suffix = strdup("Quad2D");
-  if(mesh.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  if(mesh.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  std::string suffix; // remains empty if elementType matches none of the four cases below
+  if(mesh.elementType==Mesh::TRIANGLES)
+    suffix = "Tri2D";
+  if(mesh.elementType==Mesh::QUADRILATERALS)
+    suffix = "Quad2D";
+  if(mesh.elementType==Mesh::TETRAHEDRA)
+    suffix = "Tet3D";
+  if(mesh.elementType==Mesh::HEXAHEDRA)
+    suffix = "Hex3D";
+
+  std::string oklFilePrefix = DGRADIENT "/okl/";
+  std::string oklFileSuffix = ".okl";
+
+  std::string fileName, kernelName;
 
   // kernels from volume file
-  sprintf(fileName, DGRADIENT "/okl/gradientVolume%s.okl", suffix);
-  sprintf(kernelName, "gradientVolume%s", suffix);
+  fileName   = oklFilePrefix + "gradientVolume" + suffix + oklFileSuffix;
+  kernelName = "gradientVolume" + suffix;
 
-  gradient->volumeKernel =  platform.buildKernel(fileName, kernelName,
+  volumeKernel =  platform.buildKernel(fileName, kernelName,
                                          kernelInfo);
 
   if (mesh.dim==2) {
-    sprintf(fileName, DGRADIENT "/okl/gradientInitialCondition2D.okl");
-    sprintf(kernelName, "gradientInitialCondition2D");
+    fileName   = oklFilePrefix + "gradientInitialCondition2D" + oklFileSuffix;
+    kernelName = "gradientInitialCondition2D";
   } else {
-    sprintf(fileName, DGRADIENT "/okl/gradientInitialCondition3D.okl");
-    sprintf(kernelName, "gradientInitialCondition3D");
+    fileName   = oklFilePrefix + "gradientInitialCondition3D" + oklFileSuffix;
+    kernelName = "gradientInitialCondition3D";
   }
 
-  gradient->initialConditionKernel = platform.buildKernel(fileName, kernelName,
+  initialConditionKernel = platform.buildKernel(fileName, kernelName,
                                                   kernelInfo);
 
-  return *gradient;
 }
-
-gradient_t::~gradient_t() {
-  volumeKernel.free();
-  initialConditionKernel.free();
-}
\ No newline at end of file
diff --git a/solvers/ins/data/insBeltrami3D.h b/solvers/ins/data/insBeltrami3D.h
index ebce74e3f..0f30b1bdf 100644
--- a/solvers/ins/data/insBeltrami3D.h
+++ b/solvers/ins/data/insBeltrami3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/data/insUniform2D.h b/solvers/ins/data/insUniform2D.h
index 3e6146d72..166438443 100644
--- a/solvers/ins/data/insUniform2D.h
+++ b/solvers/ins/data/insUniform2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/data/insUniform3D.h b/solvers/ins/data/insUniform3D.h
index dda2129a9..eb9344af5 100644
--- a/solvers/ins/data/insUniform3D.h
+++ b/solvers/ins/data/insUniform3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/data/insVortex2D.h b/solvers/ins/data/insVortex2D.h
index cb7bc1510..9f683e6ad 100644
--- a/solvers/ins/data/insVortex2D.h
+++ b/solvers/ins/data/insVortex2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/ins.hpp b/solvers/ins/ins.hpp
index 1900f4c9b..649d704cd 100644
--- a/solvers/ins/ins.hpp
+++ b/solvers/ins/ins.hpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -38,64 +38,62 @@
 
 #define DINS LIBP_DIR"/solvers/ins/"
 
+using namespace libp; // NOTE(review): namespace-scope using-directive in a header; every file that includes ins.hpp sees libp names unqualified
+
 class insSettings_t: public settings_t {
 public:
-  insSettings_t(MPI_Comm& _comm);
+  insSettings_t(comm_t& _comm);
   void report();
   void parseFromFile(platformSettings_t& platformSettings,
                      meshSettings_t& meshSettings,
-                     const string filename);
+                     const std::string filename);
 
-  ellipticSettings_t* extractVelocitySettings();
-  ellipticSettings_t* extractPressureSettings();
+  ellipticSettings_t extractVelocitySettings();
+  ellipticSettings_t extractPressureSettings();
 };
 
 class ins_t;
 
 class subcycler_t: public solver_t {
 public:
-  mesh_t& mesh;
+  mesh_t mesh;
 
   int cubature;
-  halo_t* vTraceHalo;
-  occa::kernel advectionVolumeKernel;
-  occa::kernel advectionSurfaceKernel;
+  ogs::halo_t vTraceHalo;
+  kernel_t advectionVolumeKernel;
+  kernel_t advectionSurfaceKernel;
 
-  occa::kernel subCycleAdvectionKernel;
+  kernel_t subCycleAdvectionKernel;
 
   int NVfields;
   int order, maxOrder, shiftIndex;
   dfloat nu, T0, dt;
 
-  occa::memory o_Ue, o_Uh;
-
-  subcycler_t() = delete;
-  subcycler_t(ins_t& ins);
+  deviceMemory<dfloat> o_Ue, o_Uh;
 
-  ~subcycler_t(){};
+  subcycler_t() = default;
 
   void Report(dfloat time, int tstep){};
 
-  void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time);
+  void rhsf(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time);
 };
 
 class ins_t: public solver_t {
 public:
-  mesh_t& mesh;
-  linAlg_t& linAlg;
-  TimeStepper::timeStepper_t* timeStepper;
+  mesh_t mesh;
+  timeStepper_t timeStepper;
 
-  halo_t* vTraceHalo;
-  halo_t* pTraceHalo;
+  ogs::halo_t vTraceHalo;
+  ogs::halo_t pTraceHalo;
 
-  ellipticSettings_t *vSettings, *pSettings;
-  elliptic_t *uSolver, *vSolver, *wSolver;
-  elliptic_t *pSolver;
+  ellipticSettings_t vSettings, pSettings;
+  elliptic_t uSolver, vSolver, wSolver;
+  elliptic_t pSolver;
 
-  linearSolver_t *uLinearSolver;
-  linearSolver_t *vLinearSolver;
-  linearSolver_t *wLinearSolver;
-  linearSolver_t *pLinearSolver;
+  linearSolver_t uLinearSolver;
+  linearSolver_t vLinearSolver;
+  linearSolver_t wLinearSolver;
+  linearSolver_t pLinearSolver;
 
   int NVfields, NTfields;
 
@@ -108,109 +106,103 @@ class ins_t: public solver_t {
   dfloat nu;
   dfloat vTau, pTau;
 
-  dfloat *u, *p;
-  occa::memory o_u, o_p;
+  memory<dfloat> u, p;
+  deviceMemory<dfloat> o_u, o_p;
 
-  occa::memory o_GU;
+  deviceMemory<dfloat> o_GU;
 
-  occa::memory o_MU;
+  deviceMemory<dfloat> o_MU;
 
-  dfloat *Vort;
-  occa::memory o_Vort;
+  memory<dfloat> Vort;
+  deviceMemory<dfloat> o_Vort;
 
   //extra buffers for solvers
-  occa::memory o_UH, o_VH, o_WH;
-  occa::memory o_rhsU, o_rhsV, o_rhsW;
-  occa::memory o_rhsP, o_PI;
-
-  occa::memory o_GUH, o_GVH, o_GWH;
-  occa::memory o_GrhsU, o_GrhsV, o_GrhsW;
-  occa::memory o_GrhsP, o_GP, o_GPI;
+  deviceMemory<dfloat> o_UH, o_VH, o_WH;
+  deviceMemory<dfloat> o_rhsU, o_rhsV, o_rhsW;
+  deviceMemory<dfloat> o_rhsP, o_PI;
 
-  int *mapB; //node-wise boundary flag
-  occa::memory o_mapB;
+  deviceMemory<dfloat> o_GUH, o_GVH, o_GWH;
+  deviceMemory<dfloat> o_GrhsU, o_GrhsV, o_GrhsW;
+  deviceMemory<dfloat> o_GrhsP, o_GP, o_GPI;
 
   //subcycling
   int Nsubcycles;
-  TimeStepper::timeStepper_t* subStepper;
-  subcycler_t *subcycler;
+  timeStepper_t subStepper;
+  subcycler_t subcycler;
 
-  occa::kernel advectionVolumeKernel;
-  occa::kernel advectionSurfaceKernel;
+  kernel_t advectionVolumeKernel;
+  kernel_t advectionSurfaceKernel;
 
-  occa::kernel divergenceVolumeKernel;
-  occa::kernel divergenceSurfaceKernel;
+  kernel_t divergenceVolumeKernel;
+  kernel_t divergenceSurfaceKernel;
 
-  occa::kernel gradientVolumeKernel;
-  occa::kernel gradientSurfaceKernel;
+  kernel_t gradientVolumeKernel;
+  kernel_t gradientSurfaceKernel;
 
-  occa::kernel velocityGradientKernel;
-  occa::kernel diffusionKernel;
+  kernel_t velocityGradientKernel;
+  kernel_t diffusionKernel;
 
-  occa::kernel velocityRhsKernel;
-  occa::kernel velocityBCKernel;
+  kernel_t velocityRhsKernel;
+  kernel_t velocityBCKernel;
 
-  occa::kernel pressureRhsKernel;
-  occa::kernel pressureBCKernel;
+  kernel_t pressureRhsKernel;
+  kernel_t pressureBCKernel;
 
-  occa::kernel pressureIncrementRhsKernel;
-  occa::kernel pressureIncrementBCKernel;
+  kernel_t pressureIncrementRhsKernel;
+  kernel_t pressureIncrementBCKernel;
 
-  occa::kernel vorticityKernel;
+  kernel_t vorticityKernel;
 
-  occa::kernel initialConditionKernel;
-  occa::kernel maxWaveSpeedKernel;
+  kernel_t initialConditionKernel;
+  kernel_t maxWaveSpeedKernel;
 
-  ins_t() = delete;
+  ins_t() = default;
   ins_t(platform_t &_platform, mesh_t &_mesh,
-              insSettings_t& _settings):
-    solver_t(_platform, _settings), mesh(_mesh), linAlg(platform.linAlg) {}
-
-  ~ins_t();
+        insSettings_t& _settings) {
+    Setup(_platform, _mesh, _settings);
+  }
 
   //setup
-  static ins_t& Setup(platform_t& platform, mesh_t& mesh,
-                      insSettings_t& settings);
-
-  void BoundarySetup();
+  void Setup(platform_t& _platform, mesh_t& _mesh,
+             insSettings_t& _settings);
 
   void Run();
 
   void Report(dfloat time, int tstep);
 
-  void PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName);
+  void PlotFields(memory<dfloat>& U, memory<dfloat>& P, memory<dfloat>& V, std::string fileName);
 
-  dfloat MaxWaveSpeed(occa::memory& o_U, const dfloat T);
+  dfloat MaxWaveSpeed(deviceMemory<dfloat>& o_U, const dfloat T);
 
-  // void rhsf(occa::memory& o_q, occa::memory& o_rhs, const dfloat time);
+  // void rhsf(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time);
 
-  void rhs_imex_f(occa::memory& o_q, occa::memory& o_rhs, const dfloat time);
-  // void rhs_imex_g(occa::memory& o_q, occa::memory& o_rhs, const dfloat time);
+  void rhs_imex_f(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time);
+  // void rhs_imex_g(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat time);
 
-  void rhs_imex_invg(occa::memory& o_q, occa::memory& o_rhs, const dfloat gamma, const dfloat time);
+  void rhs_imex_invg(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_rhs, const dfloat gamma, const dfloat time);
 
-  void rhs_subcycle_f(occa::memory& o_Q, occa::memory& o_QHAT,
-                           const dfloat T, const dfloat dt, const dfloat* B,
-                           const int order, const int shiftIndex, const int maxOrder);
+  void rhs_subcycle_f(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_QHAT,
+                      const dfloat T, const dfloat dt, const memory<dfloat> B,
+                      const int order, const int shiftIndex, const int maxOrder);
 
-  void Advection(const dfloat alpha, occa::memory& o_U,
-                 const dfloat beta,  occa::memory& o_RHS,
+  void Advection(const dfloat alpha, deviceMemory<dfloat>& o_U,
+                 const dfloat beta,  deviceMemory<dfloat>& o_RHS,
                  const dfloat T);
-  void Diffusion(const dfloat alpha, occa::memory& o_U,
-                 const dfloat beta,  occa::memory& o_RHS,
+  void Diffusion(const dfloat alpha, deviceMemory<dfloat>& o_U,
+                 const dfloat beta,  deviceMemory<dfloat>& o_RHS,
                  const dfloat T);
-  void Divergence(const dfloat alpha, occa::memory& o_U,
-                 const dfloat beta,  occa::memory& o_RHS,
+  void Divergence(const dfloat alpha, deviceMemory<dfloat>& o_U,
+                 const dfloat beta,  deviceMemory<dfloat>& o_RHS,
                  const dfloat T);
-  void Gradient(const dfloat alpha, occa::memory& o_P,
-                 const dfloat beta,  occa::memory& o_RHS,
+  void Gradient(const dfloat alpha, deviceMemory<dfloat>& o_P,
+                 const dfloat beta,  deviceMemory<dfloat>& o_RHS,
                  const dfloat T);
 
-  void VelocitySolve(occa::memory& o_U, occa::memory& o_RHS,
+  void VelocitySolve(deviceMemory<dfloat>& o_U, deviceMemory<dfloat>& o_RHS,
                      const dfloat gamma, const dfloat T);
-  void PressureSolve(occa::memory& o_P, occa::memory& o_RHS,
+  void PressureSolve(deviceMemory<dfloat>& o_P, deviceMemory<dfloat>& o_RHS,
                      const dfloat gamma, const dfloat T);
-  void PressureIncrementSolve(occa::memory& o_P, occa::memory& o_RHS,
+  void PressureIncrementSolve(deviceMemory<dfloat>& o_P, deviceMemory<dfloat>& o_RHS,
                      const dfloat gamma, const dfloat T, const dfloat dt);
 };
 
diff --git a/solvers/ins/insMain.cpp b/solvers/ins/insMain.cpp
index f49b16251..250c5e115 100644
--- a/solvers/ins/insMain.cpp
+++ b/solvers/ins/insMain.cpp
@@ -1,7 +1,7 @@
 /*
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,40 +28,41 @@ SOFTWARE.
 int main(int argc, char **argv){
 
   // start up MPI
-  MPI_Init(&argc, &argv);
+  Comm::Init(argc, argv);
 
-  MPI_Comm comm = MPI_COMM_WORLD;
+  LIBP_ABORT("Usage: ./insMain setupfile", argc!=2);
 
-  if(argc!=2)
-    LIBP_ABORT(string("Usage: ./insMain setupfile"));
+  { /*Scope so everything is destructed before MPI_Finalize */
+    comm_t comm(Comm::World().Dup());
 
-  //create default settings
-  platformSettings_t platformSettings(comm);
-  meshSettings_t meshSettings(comm);
-  insSettings_t insSettings(comm);
+    //create default settings
+    platformSettings_t platformSettings(comm);
+    meshSettings_t meshSettings(comm);
+    insSettings_t insSettings(comm);
 
-  //load settings from file
-  insSettings.parseFromFile(platformSettings, meshSettings,
-                            argv[1]);
+    //load settings from file
+    insSettings.parseFromFile(platformSettings, meshSettings,
+                              argv[1]);
 
-  // set up platform
-  platform_t platform(platformSettings);
+    // set up platform
+    platform_t platform(platformSettings);
 
-  platformSettings.report();
-  meshSettings.report();
-  insSettings.report();
+    platformSettings.report();
+    meshSettings.report();
+    insSettings.report();
 
-  // set up mesh
-  mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm);
+    // set up mesh
+    mesh_t mesh(platform, meshSettings, comm);
 
-  // set up ins solver
-  ins_t& ins = ins_t::Setup(platform, mesh, insSettings);
+    // set up ins solver
+    ins_t ins(platform, mesh, insSettings);
 
-  // run
-  ins.Run();
+    // run
+    ins.Run();
+  }
 
   // close down MPI
-  MPI_Finalize();
+  Comm::Finalize();
   return LIBP_SUCCESS;
 }
 
diff --git a/solvers/ins/makefile b/solvers/ins/makefile
index e87b7037c..ea73444b0 100644
--- a/solvers/ins/makefile
+++ b/solvers/ins/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -77,29 +77,25 @@ include ../../make.top
 endif
 endif
 
-#gslib
-GS_DIR=${LIBP_TPL_DIR}/gslib
-
 #libraries
 ELLIPTIC_DIR =${LIBP_DIR}/solvers/elliptic
-FPE_LIBP_LIBS=timeStepper linearSolver parAlmond mesh ogs linAlg core
+FPE_LIBP_LIBS=timeStepper linearSolver parAlmond mesh parAdogs ogs linAlg core
 
 #includes
 INCLUDES=-I${ELLIPTIC_DIR} \
-					${LIBP_INCLUDES} \
-				 -I.
+			${LIBP_INCLUDES} \
+			-I.
 
 #defines
 DEFINES =${LIBP_DEFINES} \
          -DLIBP_DIR='"${LIBP_DIR}"'
 
 #.cpp compilation flags
-INS_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES}
+INS_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES}
 
 #link libraries
 LIBS=-L${ELLIPTIC_DIR} -lelliptic \
-		 -L${LIBP_LIBS_DIR} $(addprefix -l,$(FPE_LIBP_LIBS)) \
-     -L$(GS_DIR)/lib -lgs \
+	  -L${LIBP_LIBS_DIR} $(addprefix -l,$(FPE_LIBP_LIBS)) \
      ${LIBP_LIBS}
 
 #link flags
@@ -155,10 +151,10 @@ endif
 # rule for .cpp files
 %.o: %.cpp $(DEPS) | libelliptic
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $*.o -c $*.cpp $(INS_CXXFLAGS)
+	$(LIBP_CXX) -o $*.o -c $*.cpp $(INS_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $*.o -c $*.cpp $(INS_CXXFLAGS)
+	@$(LIBP_CXX) -o $*.o -c $*.cpp $(INS_CXXFLAGS)
 endif
 
 #cleanup
@@ -170,8 +166,7 @@ clean-libs: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} clean
 
 clean-kernels: clean-libs
-# 	$(shell ${OCCA_DIR}/bin/occa clear all -y)
-	rm -rf ~/.occa/
+	rm -rf ${LIBP_DIR}/.occa/
 
 realclean: clean
 	${MAKE} -C ${ELLIPTIC_DIR} clean
diff --git a/solvers/ins/okl/insAdvectionHex3D.okl b/solvers/ins/okl/insAdvectionHex3D.okl
index f9efa2cb2..7d1280891 100644
--- a/solvers/ins/okl/insAdvectionHex3D.okl
+++ b/solvers/ins/okl/insAdvectionHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -62,7 +62,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_Nq
     for(int k=0;k<p_Nq;++k){
@@ -103,7 +102,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -127,7 +125,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     } //k loop
 
     //write out
@@ -241,7 +238,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -258,7 +254,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -271,7 +266,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -295,7 +289,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -308,7 +301,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -332,7 +324,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/ins/okl/insAdvectionQuad2D.okl b/solvers/ins/okl/insAdvectionQuad2D.okl
index 79cf88a55..f402e2190 100644
--- a/solvers/ins/okl/insAdvectionQuad2D.okl
+++ b/solvers/ins/okl/insAdvectionQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -76,7 +76,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -195,7 +194,6 @@ void surfaceTerms(dlong e, int es,
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -217,7 +215,6 @@ void surfaceTerms(dlong e, int es,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -238,7 +235,6 @@ void surfaceTerms(dlong e, int es,
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insAdvectionQuad3D.okl b/solvers/ins/okl/insAdvectionQuad3D.okl
index 1d9db4fac..65c39d0c2 100644
--- a/solvers/ins/okl/insAdvectionQuad3D.okl
+++ b/solvers/ins/okl/insAdvectionQuad3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -103,7 +103,6 @@
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){ 
       for(int j=0;j<p_Nq;++j;@inner(1)){ 
@@ -232,7 +231,6 @@ void surfaceTerms(dlong e, int es, dlong offset, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -254,7 +252,6 @@ void surfaceTerms(dlong e, int es, dlong offset, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -275,7 +272,6 @@ void surfaceTerms(dlong e, int es, dlong offset, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -356,7 +352,6 @@ void surfaceTerms(dlong e, int es, dlong offset, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -377,7 +372,6 @@ void surfaceTerms(dlong e, int es, dlong offset, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -392,7 +386,6 @@ void surfaceTerms(dlong e, int es, dlong offset, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");
 
     //interpolate in j and store flux in register
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -411,7 +404,6 @@ void surfaceTerms(dlong e, int es, dlong offset, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");
 
     //construct flux from registers
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -442,7 +434,6 @@ void surfaceTerms(dlong e, int es, dlong offset, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in j
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -470,7 +461,6 @@ void surfaceTerms(dlong e, int es, dlong offset, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -488,7 +478,6 @@ void surfaceTerms(dlong e, int es, dlong offset, int sk, int face, int i, int j,
       }
     }
 
-    @barrier("local");    
 
     //project/differentiate in i and write back 
     for(int es=0;es<p_NblockV;++es;@inner(2)){ 
@@ -639,7 +628,6 @@ void quadSurfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     //interpolate traces, store flux in register
     for(int es=0;es<p_NblockS;++es;@inner(1)){   
@@ -667,7 +655,6 @@ void quadSurfaceTerms(int es, int face, int m, int i, int j,
         }
       }
 
-    @barrier("local"); //need a barrier since s_fluxNU and s_fluxNV are aliased
 
     //write fluxes to @shared
     for(int es=0;es<p_NblockS;++es;@inner(1)){   
@@ -708,7 +695,6 @@ void quadSurfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -726,7 +712,6 @@ void quadSurfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -743,7 +728,6 @@ void quadSurfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
diff --git a/solvers/ins/okl/insAdvectionTet3D.okl b/solvers/ins/okl/insAdvectionTet3D.okl
index 3180bf429..7b90fbc22 100644
--- a/solvers/ins/okl/insAdvectionTet3D.okl
+++ b/solvers/ins/okl/insAdvectionTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -86,7 +86,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -216,7 +215,6 @@ SOFTWARE.
 
 
     // wait for all flux functions are written to @shared
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insAdvectionTri2D.okl b/solvers/ins/okl/insAdvectionTri2D.okl
index 03d1272db..f6a863797 100644
--- a/solvers/ins/okl/insAdvectionTri2D.okl
+++ b/solvers/ins/okl/insAdvectionTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -56,7 +56,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -188,7 +187,6 @@ SOFTWARE.
     }
 
     // wait for all flux functions are written to @shared
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insConstrainQuad3D.okl b/solvers/ins/okl/insConstrainQuad3D.okl
index f9f397151..704f658b6 100644
--- a/solvers/ins/okl/insConstrainQuad3D.okl
+++ b/solvers/ins/okl/insConstrainQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/okl/insCubatureAdvectionHex3D.okl b/solvers/ins/okl/insCubatureAdvectionHex3D.okl
index 8ee5da269..cfd11434a 100644
--- a/solvers/ins/okl/insCubatureAdvectionHex3D.okl
+++ b/solvers/ins/okl/insCubatureAdvectionHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -88,7 +88,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //read in and interpolate in k
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -113,7 +112,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_cubNq
     for(int k=0;k<p_cubNq;++k){
@@ -128,7 +126,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -146,7 +143,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -158,7 +154,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -174,7 +169,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     //#pragma unroll p_cubNq
@@ -215,7 +209,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -239,7 +232,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     } //k loop
 
     #pragma unroll p_cubNq
@@ -253,7 +245,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -271,7 +262,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -283,7 +273,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -301,7 +290,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     //project in k and write out
@@ -369,7 +357,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -389,7 +376,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -404,7 +390,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -422,7 +407,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -457,7 +441,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -473,7 +456,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -484,7 +466,6 @@ SOFTWARE.
         }                                                               \
       }                                                                 \
     }                                                                   \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -500,7 +481,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -578,11 +558,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(0) //face 0
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -595,11 +573,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(5) //face 5
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -612,11 +588,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(1) //face 1
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -632,11 +606,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(3) //face 3
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -652,11 +624,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(2) //face 2
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -672,11 +642,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(4) //face 4
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -692,7 +660,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -754,7 +721,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //read in and interpolate in k
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -779,7 +745,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_cubNq
     for(int k=0;k<p_cubNq;++k){
@@ -794,7 +759,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -812,7 +776,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -824,7 +787,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -844,7 +806,6 @@ SOFTWARE.
           cU[id+2*offset] = r_W[k];
         }
       }
-      @barrier("local");
     }
   }
 
@@ -881,7 +842,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_cubNq
     for(int k=0;k<p_cubNq;++k){
@@ -922,7 +882,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -946,7 +905,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     } //k loop
 
     //write out
@@ -990,7 +948,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //read in and project in k
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -1013,7 +970,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_Nq
     for(int k=0;k<p_Nq;++k){
@@ -1026,7 +982,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1044,7 +999,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1056,7 +1010,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1081,7 +1034,6 @@ SOFTWARE.
           }
         }
       }
-      @barrier("local");
     }
   }
 }
@@ -1122,7 +1074,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -1187,4 +1138,4 @@ SOFTWARE.
     }
   }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/solvers/ins/okl/insCubatureAdvectionQuad2D.okl b/solvers/ins/okl/insCubatureAdvectionQuad2D.okl
index 843f85cec..fa0ece440 100644
--- a/solvers/ins/okl/insCubatureAdvectionQuad2D.okl
+++ b/solvers/ins/okl/insCubatureAdvectionQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -77,7 +77,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -97,7 +96,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -111,7 +109,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in j and store flux in register
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -129,7 +126,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //construct flux from registers
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -156,7 +152,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in j
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -180,7 +175,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -196,7 +190,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in i and write back
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -348,7 +341,6 @@ void quadSurfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     //interpolate traces, store flux in register
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -374,7 +366,6 @@ void quadSurfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local"); //need a barrier since s_fluxNU and s_fluxNV are aliased
 
     //write fluxes to @shared
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -411,7 +402,6 @@ void quadSurfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -429,7 +419,6 @@ void quadSurfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -446,7 +435,6 @@ void quadSurfaceTerms(int es, int face, int m, int i, int j,
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
diff --git a/solvers/ins/okl/insCubatureAdvectionTet3D.okl b/solvers/ins/okl/insCubatureAdvectionTet3D.okl
index d7284a641..f5772064d 100644
--- a/solvers/ins/okl/insCubatureAdvectionTet3D.okl
+++ b/solvers/ins/okl/insCubatureAdvectionTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -68,7 +68,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
 
     for(int es=0;es<p_cubNblockV;++es;@inner(1)){// for all elements in block
@@ -116,7 +115,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_cubNp;++n;@inner(0)){     // for all nodes in this element
@@ -222,7 +220,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -288,7 +285,6 @@ SOFTWARE.
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // lift from surface integration to volume nodes
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insCubatureAdvectionTri2D.okl b/solvers/ins/okl/insCubatureAdvectionTri2D.okl
index 6154f1754..ed46f4c8d 100644
--- a/solvers/ins/okl/insCubatureAdvectionTri2D.okl
+++ b/solvers/ins/okl/insCubatureAdvectionTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -59,7 +59,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_cubNp;++n;@inner(0)){     // for all nodes in this element
@@ -81,7 +80,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_cubNp;++n;@inner(0)){     // for all nodes in this element
@@ -186,7 +184,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -239,7 +236,6 @@ SOFTWARE.
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // lift from surface integration to volume nodes
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insDiffusionHex3D.okl b/solvers/ins/okl/insDiffusionHex3D.okl
index 4d41b2fe9..b702def04 100644
--- a/solvers/ins/okl/insDiffusionHex3D.okl
+++ b/solvers/ins/okl/insDiffusionHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -119,7 +119,6 @@ SOFTWARE.
     #pragma unroll p_NVfields
     for (int fld=0; fld<p_NVfields; fld++) {
 
-      @barrier("local");
 
       // loop over slabs
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -143,7 +142,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // face 0 & 5
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -156,7 +154,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // face 0 & 5
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -175,7 +172,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // face 1 & 3
       for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -188,7 +184,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // face 1 & 3
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -214,7 +209,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // face 2 & 4
       for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -228,7 +222,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // face 2 & 4
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -254,7 +247,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       //layer by layer
       #pragma unroll p_Nq
@@ -286,7 +278,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -303,7 +294,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
       }
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
diff --git a/solvers/ins/okl/insDiffusionQuad2D.okl b/solvers/ins/okl/insDiffusionQuad2D.okl
index b3bcab1d3..ed0bd0c84 100644
--- a/solvers/ins/okl/insDiffusionQuad2D.okl
+++ b/solvers/ins/okl/insDiffusionQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -110,7 +110,6 @@ SOFTWARE.
     #pragma unroll p_NVfields
     for (int fld=0; fld<p_NVfields; fld++) {
 
-      @barrier("local");
 
       // loop over slabs
       for(int j=0;j<p_Nq;++j){
@@ -131,7 +130,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // loop over faces to add pseudo-gradient
 
@@ -144,7 +142,6 @@ SOFTWARE.
         surfaceTerms(sk2,2,i,p_Nq-1);
       }
 
-      @barrier("local");
 
       // face 1 & 3
       for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -155,7 +152,6 @@ SOFTWARE.
         surfaceTerms(sk3,3,0,j);
       }
 
-      @barrier("local");
 
       // prescale by geofacs
       for(int j=0;j<p_Nq;++j){
@@ -177,7 +173,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // loop over slabs
       for(int j=0;j<p_Nq;++j){
diff --git a/solvers/ins/okl/insDiffusionQuad3D.okl b/solvers/ins/okl/insDiffusionQuad3D.okl
index ab6ba626e..fa6abb2ba 100644
--- a/solvers/ins/okl/insDiffusionQuad3D.okl
+++ b/solvers/ins/okl/insDiffusionQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,50 +25,50 @@ SOFTWARE.
 */
 
 
-#define surfaceTerms(sk,face,i, j)					\
+#define surfaceTerms(sk,face,i, j)                                      \
   {                                                                     \
-    const dlong  idM = vmapM[sk];					\
-    const dlong  idP = vmapP[sk];					\
+    const dlong  idM = vmapM[sk];                                       \
+    const dlong  idP = vmapP[sk];                                       \
                                                                         \
-    const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];				\
-    const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];				\
-    const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];				\
-    const dfloat WsJ = sgeo[sk*p_Nsgeo+p_WSJID];			\
-    const dfloat hinv= sgeo[sk*p_Nsgeo+p_IHID];				\
+    const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];                          \
+    const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];                          \
+    const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];                          \
+    const dfloat WsJ = sgeo[sk*p_Nsgeo+p_WSJID];                        \
+    const dfloat hinv= sgeo[sk*p_Nsgeo+p_IHID];                         \
                                                                         \
-    const dfloat4 gradqM = GU[idM+fld*fieldOffset];			\
-    dfloat4 gradqP = GU[idP+fld*fieldOffset];				\
+    const dfloat4 gradqM = GU[idM+fld*fieldOffset];                     \
+    dfloat4 gradqP = GU[idP+fld*fieldOffset];                           \
                                                                         \
-    dfloat dq = gradqP.w - gradqM.w;					\
+    dfloat dq = gradqP.w - gradqM.w;                                    \
                                                                         \
-    s_dqdx[j][i] += 0.5f*WsJ*nx*dq;					\
-    s_dqdy[j][i] += 0.5f*WsJ*ny*dq;					\
-    s_dqdz[j][i] += 0.5f*WsJ*nz*dq;					\
+    s_dqdx[j][i] += 0.5f*WsJ*nx*dq;                                     \
+    s_dqdy[j][i] += 0.5f*WsJ*ny*dq;                                     \
+    s_dqdz[j][i] += 0.5f*WsJ*nz*dq;                                     \
                                                                         \
     s_Lq[j][i] -= 0.5f*WsJ*(nx*(gradqP.x+gradqM.x) +                    \
                             ny*(gradqP.y+gradqM.y) +                    \
-			    nz*(gradqP.z+gradqM.z) +                    \
+                            nz*(gradqP.z+gradqM.z) +                    \
                             tau*dq*hinv);                               \
   }
 
 @kernel void insDiffusionIpdgQuad2D(const dlong Nelements,
-				    @restrict const  dlong *  elementList,
-				    @restrict const  dlong *  vmapM,
-				    @restrict const  dlong *  vmapP,
-				    const dfloat nu,
-				    const dfloat tau,
-				    @restrict const  dfloat *  vgeo,
-				    @restrict const  dfloat *  sgeo,
-				    @restrict const  int   *  EToB,
-				    const dfloat time,
-				    @restrict const  dfloat *  x,
-				    @restrict const  dfloat *  y,
-				    @restrict const  dfloat *  z,
-				    const dlong fieldOffset,
-				    @restrict const  dfloat *  D,
-				    @restrict const  dfloat *  LIFTT,
-				    @restrict const  dfloat4 *  GU,
-				    @restrict dfloat  *  LU){
+                                    @restrict const  dlong *  elementList,
+                                    @restrict const  dlong *  vmapM,
+                                    @restrict const  dlong *  vmapP,
+                                    const dfloat nu,
+                                    const dfloat tau,
+                                    @restrict const  dfloat *  vgeo,
+                                    @restrict const  dfloat *  sgeo,
+                                    @restrict const  int   *  EToB,
+                                    const dfloat time,
+                                    @restrict const  dfloat *  x,
+                                    @restrict const  dfloat *  y,
+                                    @restrict const  dfloat *  z,
+                                    const dlong fieldOffset,
+                                    @restrict const  dfloat *  D,
+                                    @restrict const  dfloat *  LIFTT,
+                                    @restrict const  dfloat4 *  GU,
+                                    @restrict dfloat  *  LU){
 
   for(dlong e=0;e<Nelements;++e;@outer(0)){
 
@@ -83,7 +83,6 @@ SOFTWARE.
 #pragma unroll p_NVfields
     for (int fld=0; fld<p_NVfields; fld++) {
 
-      @barrier("local");
 
       // loop over slabs
       for(int j=0;j<p_Nq;++j){
@@ -98,14 +97,13 @@ SOFTWARE.
 
           s_dqdx[j][i] = JW*gradqn.x;
           s_dqdy[j][i] = JW*gradqn.y;
-	        s_dqdz[j][i] = JW*gradqn.z;
+                s_dqdz[j][i] = JW*gradqn.z;
           s_Lq[j][i] = 0.0;
 
           s_D[j][i] = D[j*p_Nq+i];
         }
       }
 
-      @barrier("local");
 
       // loop over faces to add pseudo-gradient
 
@@ -118,7 +116,6 @@ SOFTWARE.
         surfaceTerms(sk2,2,i,p_Nq-1);
       }
 
-      @barrier("local");
 
       // face 1 & 3
       for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -129,7 +126,6 @@ SOFTWARE.
         surfaceTerms(sk3,3,0,j);
       }
 
-      @barrier("local");
 
       // prescale by geofacs
       for(int j=0;j<p_Nq;++j){
@@ -154,11 +150,10 @@ SOFTWARE.
 
           s_dqdx[j][i] = (drdx*dqdx + drdy*dqdy + drdz*dqdz);
           s_dqdy[j][i] = (dsdx*dqdx + dsdy*dqdy + dsdz*dqdz);
-	        s_dqdz[j][i] = (dtdx*dqdx + dtdy*dqdy + dtdz*dqdz);
+                s_dqdz[j][i] = (dtdx*dqdx + dtdy*dqdy + dtdz*dqdz);
         }
       }
 
-      @barrier("local");
 
       // loop over slabs
       for(int j=0;j<p_Nq;++j){
diff --git a/solvers/ins/okl/insDiffusionTet3D.okl b/solvers/ins/okl/insDiffusionTet3D.okl
index 100d23be8..8381e6fb4 100644
--- a/solvers/ins/okl/insDiffusionTet3D.okl
+++ b/solvers/ins/okl/insDiffusionTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -64,7 +64,6 @@ SOFTWARE.
     //#pragma unroll p_NVfields
     for (int fld=0; fld<p_NVfields; fld++) {
 
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         element = elementList[e];
@@ -147,7 +146,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // dqdx += LIFT*(sJ/J)*nx*dq
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -189,7 +187,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_NfacesNfp){
@@ -198,7 +195,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_Np){
diff --git a/solvers/ins/okl/insDiffusionTri2D.okl b/solvers/ins/okl/insDiffusionTri2D.okl
index 30093d781..e6fb2c65b 100644
--- a/solvers/ins/okl/insDiffusionTri2D.okl
+++ b/solvers/ins/okl/insDiffusionTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -61,7 +61,6 @@ SOFTWARE.
     #pragma unroll p_NVfields
     for (int i=0; i<p_NVfields; i++) {
 
-      @barrier("local");
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         element = elementList[e];
 
@@ -139,7 +138,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // dqdx += LIFT*(sJ/J)*nx*dq
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -170,7 +168,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_NfacesNfp){
@@ -179,7 +176,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_Np){
diff --git a/solvers/ins/okl/insDivergenceHex3D.okl b/solvers/ins/okl/insDivergenceHex3D.okl
index 28cbea7db..3657b65e6 100644
--- a/solvers/ins/okl/insDivergenceHex3D.okl
+++ b/solvers/ins/okl/insDivergenceHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -53,7 +53,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_Nq
     for(int k=0;k<p_Nq;++k){
@@ -85,7 +84,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -101,7 +99,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     } //k loop
 
     //write out
@@ -196,7 +193,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -209,7 +205,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -222,7 +217,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -242,7 +236,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -255,7 +248,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -275,7 +267,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -328,7 +319,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
diff --git a/solvers/ins/okl/insDivergenceQuad2D.okl b/solvers/ins/okl/insDivergenceQuad2D.okl
index 8d97e1e95..85482e622 100644
--- a/solvers/ins/okl/insDivergenceQuad2D.okl
+++ b/solvers/ins/okl/insDivergenceQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -57,7 +57,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -161,7 +160,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -178,7 +176,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -194,7 +191,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int j=0;j<p_Nq;++j){
diff --git a/solvers/ins/okl/insDivergenceQuad3D.okl b/solvers/ins/okl/insDivergenceQuad3D.okl
index dd71592c7..b1dab3fcb 100644
--- a/solvers/ins/okl/insDivergenceQuad3D.okl
+++ b/solvers/ins/okl/insDivergenceQuad3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -26,11 +26,11 @@
 
 // Compute Gradient in Strong Form
 @kernel void insDivergenceVolumeQuad3D(const dlong Nelements,
-				       @restrict const  dfloat *  vgeo,
-				       @restrict const  dfloat *  D,
-				       const dlong offset,
-				       @restrict const  dfloat *  U,
-				       @restrict dfloat *  divU){
+                                       @restrict const  dfloat *  vgeo,
+                                       @restrict const  dfloat *  D,
+                                       const dlong offset,
+                                       @restrict const  dfloat *  U,
+                                       @restrict dfloat *  divU){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
 
@@ -39,31 +39,30 @@
     @shared dfloat s_W[p_NblockV][p_Nq][p_Nq];
     @shared dfloat s_D[p_Nq][p_Nq];
 
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_Nq;++j;@inner(1)){ 
-        for(int i=0;i<p_Nq;++i;@inner(0)){    
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_Nq;++j;@inner(1)){
+        for(int i=0;i<p_Nq;++i;@inner(0)){
           const dlong e = eo+es; // element in block
-          if(e<Nelements){ 
+          if(e<Nelements){
             const dlong id = e*p_Np + j*p_Nq+i;
-            
+
             s_U[es][j][i] = U[id+0*offset];
             s_V[es][j][i] = U[id+1*offset];
             s_W[es][j][i] = U[id+2*offset];
-	  }
-	  if (es==0)
-	    s_D[j][i] = D[j*p_Nq+i];
+          }
+          if (es==0)
+            s_D[j][i] = D[j*p_Nq+i];
         }
       }
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
-    for(int es=0;es<p_NblockV;++es;@inner(2)){ 
-      for(int j=0;j<p_Nq;++j;@inner(1)){ 
-        for(int i=0;i<p_Nq;++i;@inner(0)){  
-          const dlong e = eo+es; 
-          if(e<Nelements){ 
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_Nq;++j;@inner(1)){
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+          const dlong e = eo+es;
+          if(e<Nelements){
             const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
 
             const dfloat drdx = vgeo[gid + p_RXID*p_Np];
@@ -81,16 +80,16 @@
             dfloat dWdr  = 0.f, dWds  = 0.f;
 
 #pragma unroll p_Nq
-	    for(int n=0;n<p_Nq;++n){
-	      const dfloat Dr = s_D[i][n];
-	      const dfloat Ds = s_D[j][n];
-	      dUdr += Dr*s_U[es][j][n];
-	      dUds += Ds*s_U[es][n][i];
-	      dVdr += Dr*s_V[es][j][n];
-	      dVds += Ds*s_V[es][n][i];
-	      dWdr += Dr*s_W[es][j][n];
-	      dWds += Ds*s_W[es][n][i];
-	    }
+            for(int n=0;n<p_Nq;++n){
+              const dfloat Dr = s_D[i][n];
+              const dfloat Ds = s_D[j][n];
+              dUdr += Dr*s_U[es][j][n];
+              dUds += Ds*s_U[es][n][i];
+              dVdr += Dr*s_V[es][j][n];
+              dVds += Ds*s_V[es][n][i];
+              dWdr += Dr*s_W[es][j][n];
+              dWds += Ds*s_W[es][n][i];
+            }
 
             const dlong id = e*p_Np + j*p_Nq + i;
             // const dfloat dUdx = drdx*dUdr + dsdx*dUds + dtdx*s_U[es][j][i];
@@ -105,41 +104,41 @@
         }
       }
     }
-  }  
+  }
 }
 
 
 #define surfaceTerms(sk,face,i, j)                                      \
   {                                                                     \
-    const dlong idM = vmapM[sk];					\
-    const dlong idP = vmapP[sk];					\
+    const dlong idM = vmapM[sk];                                        \
+    const dlong idP = vmapP[sk];                                        \
+                                                                        \
+    const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];                          \
+    const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];                          \
+    const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];                          \
+    const dfloat sJ = sgeo[sk*p_Nsgeo+p_SJID];                          \
+    const dfloat invWJ = sgeo[sk*p_Nsgeo+p_WIJID];                      \
                                                                         \
-    const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];				\
-    const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];				\
-    const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];				\
-    const dfloat sJ = sgeo[sk*p_Nsgeo+p_SJID];				\
-    const dfloat invWJ = sgeo[sk*p_Nsgeo+p_WIJID];			\
-    									\
     const dfloat uM = U[idM+0*offset], vM = U[idM+1*offset], wM = U[idM+2*offset]; \
     dfloat uP = U[idP+0*offset], vP = U[idP+1*offset], wP = U[idP+2*offset]; \
-    									\
-    const dfloat sc = invWJ * sJ;					\
-    s_fluxU[es][j][i] += 0.5f*sc*(nx*(uP-uM) + ny*(vP-vM) + nz*(wP-wM));	\
+                                                                        \
+    const dfloat sc = invWJ * sJ;                                       \
+    s_fluxU[es][j][i] += 0.5f*sc*(nx*(uP-uM) + ny*(vP-vM) + nz*(wP-wM));        \
   }
 
 @kernel void insDivergenceSurfaceQuad3D(const dlong Nelements,
-					@restrict const  dfloat *  sgeo,
-					@restrict const  dfloat *  LIFTT,
-					@restrict const  dlong  *  vmapM,
-					@restrict const  dlong  *  vmapP,
-					@restrict const  int    *  EToB,
-					const dfloat time,
-					@restrict const  dfloat *  x,
-					@restrict const  dfloat *  y,
-					@restrict const  dfloat *  z,
-					const dlong offset,
-					@restrict const  dfloat *  U,
-					@restrict dfloat *  divU){
+                                        @restrict const  dfloat *  sgeo,
+                                        @restrict const  dfloat *  LIFTT,
+                                        @restrict const  dlong  *  vmapM,
+                                        @restrict const  dlong  *  vmapP,
+                                        @restrict const  int    *  EToB,
+                                        const dfloat time,
+                                        @restrict const  dfloat *  x,
+                                        @restrict const  dfloat *  y,
+                                        @restrict const  dfloat *  z,
+                                        const dlong offset,
+                                        @restrict const  dfloat *  U,
+                                        @restrict dfloat *  divU){
 
   // for all elements
   for(dlong eo=0;eo<Nelements;eo+=p_NblockS;@outer(0)){
@@ -154,7 +153,6 @@
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -171,7 +169,6 @@
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -187,7 +184,6 @@
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int j=0;j<p_Nq;++j){
diff --git a/solvers/ins/okl/insDivergenceTet3D.okl b/solvers/ins/okl/insDivergenceTet3D.okl
index d582754dc..851bcd854 100644
--- a/solvers/ins/okl/insDivergenceTet3D.okl
+++ b/solvers/ins/okl/insDivergenceTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -53,7 +53,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
@@ -181,7 +180,6 @@ SOFTWARE.
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insDivergenceTri2D.okl b/solvers/ins/okl/insDivergenceTri2D.okl
index 5f729fe3a..7b9069f5c 100644
--- a/solvers/ins/okl/insDivergenceTri2D.okl
+++ b/solvers/ins/okl/insDivergenceTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -51,7 +51,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
@@ -157,7 +156,6 @@ SOFTWARE.
     }
 
     // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insGradientHex3D.okl b/solvers/ins/okl/insGradientHex3D.okl
index f672e3191..52a87d693 100644
--- a/solvers/ins/okl/insGradientHex3D.okl
+++ b/solvers/ins/okl/insGradientHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -52,7 +52,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -159,7 +158,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 5
@@ -173,7 +171,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -190,7 +187,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -203,7 +199,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -227,7 +222,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -240,7 +234,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -264,7 +257,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/ins/okl/insGradientQuad2D.okl b/solvers/ins/okl/insGradientQuad2D.okl
index 93a0f57ff..e31d3658f 100644
--- a/solvers/ins/okl/insGradientQuad2D.okl
+++ b/solvers/ins/okl/insGradientQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -55,7 +55,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -149,7 +148,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -166,7 +164,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -182,7 +179,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int j=0;j<p_Nq;++j){
diff --git a/solvers/ins/okl/insGradientQuad3D.okl b/solvers/ins/okl/insGradientQuad3D.okl
index f69efce48..2cd732c43 100644
--- a/solvers/ins/okl/insGradientQuad3D.okl
+++ b/solvers/ins/okl/insGradientQuad3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -26,51 +26,50 @@
 
 // Compute Gradient in Strong Form
 @kernel void insGradientVolumeQuad3D(const dlong Nelements,
-				     @restrict const  dfloat *  vgeo,
-				     @restrict const  dfloat *  D,
-				     const dlong fieldOffset,
-				     @restrict const  dfloat *  P,
-				     @restrict dfloat *  GP){
+                                     @restrict const  dfloat *  vgeo,
+                                     @restrict const  dfloat *  D,
+                                     const dlong fieldOffset,
+                                     @restrict const  dfloat *  P,
+                                     @restrict dfloat *  GP){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
 
     @shared dfloat s_P[p_NblockV][p_Nq][p_Nq];
     @shared dfloat s_D[p_Nq][p_Nq];
 
-    for(int es=0;es<p_NblockV;++es;@inner(2)){   
-      for(int j=0;j<p_Nq;++j;@inner(1)){ 
-        for(int i=0;i<p_Nq;++i;@inner(0)){    
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_Nq;++j;@inner(1)){
+        for(int i=0;i<p_Nq;++i;@inner(0)){
           const dlong e = eo+es; // element in block
-          if(e<Nelements){ 
+          if(e<Nelements){
             const dlong id = e*p_Np + j*p_Nq+i;
 
             s_P[es][j][i]  = P[id];
-	  }
-	  if (es==0){
-	    s_D[j][i] = D[j*p_Nq+i];
-	  }
+          }
+          if (es==0){
+            s_D[j][i] = D[j*p_Nq+i];
+          }
         }
       }
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
-    for(int es=0;es<p_NblockV;++es;@inner(2)){ 
-      for(int j=0;j<p_Nq;++j;@inner(1)){ 
-        for(int i=0;i<p_Nq;++i;@inner(0)){  
-          const dlong e = eo+es; 
-          if(e<Nelements){ 
+    for(int es=0;es<p_NblockV;++es;@inner(2)){
+      for(int j=0;j<p_Nq;++j;@inner(1)){
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+          const dlong e = eo+es;
+          if(e<Nelements){
             const dlong gid = e*p_Np*p_Nvgeo+ j*p_Nq +i;
-            
+
             const dfloat drdx = vgeo[gid + p_RXID*p_Np];
             const dfloat drdy = vgeo[gid + p_RYID*p_Np];
             const dfloat drdz = vgeo[gid + p_RZID*p_Np];
-            
+
             const dfloat dsdx = vgeo[gid + p_SXID*p_Np];
             const dfloat dsdy = vgeo[gid + p_SYID*p_Np];
             const dfloat dsdz = vgeo[gid + p_SZID*p_Np];
-            
+
             // const dfloat dtdx = vgeo[gid + p_TXID*p_Np];
             // const dfloat dtdy = vgeo[gid + p_TYID*p_Np];
             // const dfloat dtdz = vgeo[gid + p_TZID*p_Np];
@@ -97,109 +96,106 @@
         }
       }
     }
-  }  
+  }
 }
 
 
-#define surfaceTerms(sk,face,i, j)			\
-  {							\
-    const dlong idM = vmapM[sk];			\
-    const dlong idP = vmapP[sk];			\
-    							\
-    const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];		\
-    const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];		\
-    const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];		\
-    const dfloat sJ = sgeo[sk*p_Nsgeo+p_SJID];		\
-    const dfloat invWJ = sgeo[sk*p_Nsgeo+p_WIJID];	\
-							\
-    const dfloat  PM = P[idM];				\
-    dfloat  PP = P[idP];				\
-							\
-    const dfloat sc = invWJ * sJ;			\
-    s_fluxPx[es][j][i] += sc*(.5f*nx*(PP-PM));		\
-    s_fluxPy[es][j][i] += sc*(.5f*ny*(PP-PM));		\
-    s_fluxPz[es][j][i] += sc*(.5f*nz*(PP-PM));		\
+#define surfaceTerms(sk,face,i, j)                      \
+  {                                                     \
+    const dlong idM = vmapM[sk];                        \
+    const dlong idP = vmapP[sk];                        \
+                                                        \
+    const dfloat nx = sgeo[sk*p_Nsgeo+p_NXID];          \
+    const dfloat ny = sgeo[sk*p_Nsgeo+p_NYID];          \
+    const dfloat nz = sgeo[sk*p_Nsgeo+p_NZID];          \
+    const dfloat sJ = sgeo[sk*p_Nsgeo+p_SJID];          \
+    const dfloat invWJ = sgeo[sk*p_Nsgeo+p_WIJID];      \
+                                                        \
+    const dfloat  PM = P[idM];                          \
+    dfloat  PP = P[idP];                                \
+                                                        \
+    const dfloat sc = invWJ * sJ;                       \
+    s_fluxPx[es][j][i] += sc*(.5f*nx*(PP-PM));          \
+    s_fluxPy[es][j][i] += sc*(.5f*ny*(PP-PM));          \
+    s_fluxPz[es][j][i] += sc*(.5f*nz*(PP-PM));          \
   }
 
 @kernel void insGradientSurfaceQuad3D(const dlong Nelements,
-				      @restrict const  dfloat *  sgeo,
-				      @restrict const  dfloat *  LIFTT,
-				      @restrict const  dlong  *  vmapM,
-				      @restrict const  dlong  *  vmapP,
-				      @restrict const  int   *  EToB,
-				      @restrict const  dfloat *  x,
-				      @restrict const  dfloat *  y,
-				      @restrict const  dfloat *  z,
-				      const dfloat time,
-				      const dlong fieldOffset,
-				      @restrict const  dfloat *  P,
-				      @restrict dfloat *  GP){
-  
+                                      @restrict const  dfloat *  sgeo,
+                                      @restrict const  dfloat *  LIFTT,
+                                      @restrict const  dlong  *  vmapM,
+                                      @restrict const  dlong  *  vmapP,
+                                      @restrict const  int   *  EToB,
+                                      @restrict const  dfloat *  x,
+                                      @restrict const  dfloat *  y,
+                                      @restrict const  dfloat *  z,
+                                      const dfloat time,
+                                      const dlong fieldOffset,
+                                      @restrict const  dfloat *  P,
+                                      @restrict dfloat *  GP){
+
   // for all elements
   for(dlong eo=0;eo<Nelements;eo+=p_NblockS;@outer(0)){
     // @shared storage for flux terms
     @shared dfloat s_fluxPx[p_NblockS][p_Nq][p_Nq];
     @shared dfloat s_fluxPy[p_NblockS][p_Nq][p_Nq];
     @shared dfloat s_fluxPz[p_NblockS][p_Nq][p_Nq];
-    
+
     for(int j=0;j<p_Nq;++j){
       for(int es=0;es<p_NblockS;++es;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
-	  s_fluxPx[es][j][i] = 0.f;
-	  s_fluxPy[es][j][i] = 0.f;
-	  s_fluxPz[es][j][i] = 0.f;
-	}
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+          s_fluxPx[es][j][i] = 0.f;
+          s_fluxPy[es][j][i] = 0.f;
+          s_fluxPz[es][j][i] = 0.f;
+        }
       }
     }
-    
-    @barrier("local");
-    
+
+
     // for all face nodes of all elements
     // face 0 & 2
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
-	const dlong e = eo + es;
-	if(e<Nelements){
-	  dlong sk0 = e*p_Nfp*p_Nfaces + 0*p_Nfp + i;
-	  dlong sk2 = e*p_Nfp*p_Nfaces + 2*p_Nfp + i;
-	  
-	  surfaceTerms(sk0,0,i,0     );
-	  surfaceTerms(sk2,2,i,p_Nq-1);
-	}
+        const dlong e = eo + es;
+        if(e<Nelements){
+          dlong sk0 = e*p_Nfp*p_Nfaces + 0*p_Nfp + i;
+          dlong sk2 = e*p_Nfp*p_Nfaces + 2*p_Nfp + i;
+
+          surfaceTerms(sk0,0,i,0     );
+          surfaceTerms(sk2,2,i,p_Nq-1);
+        }
       }
     }
-    
-    @barrier("local");
-    
+
+
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int j=0;j<p_Nq;++j;@inner(0)){
-	const dlong e = eo + es;
-	if(e<Nelements){
-	  dlong sk1 = e*p_Nfp*p_Nfaces + 1*p_Nfp + j;
-	  dlong sk3 = e*p_Nfp*p_Nfaces + 3*p_Nfp + j;
-	  
-	  surfaceTerms(sk1,1,p_Nq-1,j);
-	  surfaceTerms(sk3,3,0     ,j);
-	}
+        const dlong e = eo + es;
+        if(e<Nelements){
+          dlong sk1 = e*p_Nfp*p_Nfaces + 1*p_Nfp + j;
+          dlong sk3 = e*p_Nfp*p_Nfaces + 3*p_Nfp + j;
+
+          surfaceTerms(sk1,1,p_Nq-1,j);
+          surfaceTerms(sk3,3,0     ,j);
+        }
       }
     }
-    
-    @barrier("local");
-    
+
+
     // for each node in the element
     for(int j=0;j<p_Nq;++j){
       for(int es=0;es<p_NblockS;++es;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
-	  const dlong e = eo + es;
-	  if(e<Nelements){
-	    const dlong id = e*p_Np + j*p_Nq + i;
-	    
-	    GP[id+0*fieldOffset] += s_fluxPx[es][j][i];
-	    GP[id+1*fieldOffset] += s_fluxPy[es][j][i];
-	    GP[id+2*fieldOffset] += s_fluxPz[es][j][i];
-	  }
-	}
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+          const dlong e = eo + es;
+          if(e<Nelements){
+            const dlong id = e*p_Np + j*p_Nq + i;
+
+            GP[id+0*fieldOffset] += s_fluxPx[es][j][i];
+            GP[id+1*fieldOffset] += s_fluxPy[es][j][i];
+            GP[id+2*fieldOffset] += s_fluxPz[es][j][i];
+          }
+        }
       }
     }
   }
diff --git a/solvers/ins/okl/insGradientTet3D.okl b/solvers/ins/okl/insGradientTet3D.okl
index f793f7b1e..f2de10720 100644
--- a/solvers/ins/okl/insGradientTet3D.okl
+++ b/solvers/ins/okl/insGradientTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -47,7 +47,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
@@ -158,7 +157,6 @@ SOFTWARE.
     }
 
     // wait for all flux functions are written to @shared
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insGradientTri2D.okl b/solvers/ins/okl/insGradientTri2D.okl
index 1dac7a555..a4a7370da 100644
--- a/solvers/ins/okl/insGradientTri2D.okl
+++ b/solvers/ins/okl/insGradientTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -47,7 +47,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
@@ -146,7 +145,6 @@ SOFTWARE.
     }
 
     // wait for all flux functions are written to @shared
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -199,7 +197,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
@@ -234,4 +231,4 @@ SOFTWARE.
     }
   }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/solvers/ins/okl/insInitialCondition2D.okl b/solvers/ins/okl/insInitialCondition2D.okl
index 74ad8c8e4..43124d7bb 100644
--- a/solvers/ins/okl/insInitialCondition2D.okl
+++ b/solvers/ins/okl/insInitialCondition2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/okl/insInitialCondition3D.okl b/solvers/ins/okl/insInitialCondition3D.okl
index 8bef7bf46..627b24756 100644
--- a/solvers/ins/okl/insInitialCondition3D.okl
+++ b/solvers/ins/okl/insInitialCondition3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/okl/insMaxWaveSpeedHex3D.okl b/solvers/ins/okl/insMaxWaveSpeedHex3D.okl
index 31dc20708..0d5800875 100644
--- a/solvers/ins/okl/insMaxWaveSpeedHex3D.okl
+++ b/solvers/ins/okl/insMaxWaveSpeedHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/okl/insMaxWaveSpeedQuad2D.okl b/solvers/ins/okl/insMaxWaveSpeedQuad2D.okl
index d56e8b7b6..0babefe72 100644
--- a/solvers/ins/okl/insMaxWaveSpeedQuad2D.okl
+++ b/solvers/ins/okl/insMaxWaveSpeedQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/okl/insMaxWaveSpeedTet3D.okl b/solvers/ins/okl/insMaxWaveSpeedTet3D.okl
index d9bc37286..b5baaacfe 100644
--- a/solvers/ins/okl/insMaxWaveSpeedTet3D.okl
+++ b/solvers/ins/okl/insMaxWaveSpeedTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/okl/insMaxWaveSpeedTri2D.okl b/solvers/ins/okl/insMaxWaveSpeedTri2D.okl
index 7cdb325ac..068e54ea5 100644
--- a/solvers/ins/okl/insMaxWaveSpeedTri2D.okl
+++ b/solvers/ins/okl/insMaxWaveSpeedTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/okl/insPressureIncrementRhsHex3D.okl b/solvers/ins/okl/insPressureIncrementRhsHex3D.okl
index c16e424b5..389b1c123 100644
--- a/solvers/ins/okl/insPressureIncrementRhsHex3D.okl
+++ b/solvers/ins/okl/insPressureIncrementRhsHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -106,7 +106,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -121,7 +120,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -133,7 +131,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -155,7 +152,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -167,7 +163,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -189,7 +184,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // Layer by layer
     #pragma unroll p_Nq
@@ -209,7 +203,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -225,7 +218,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -249,7 +241,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -363,7 +354,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -376,7 +366,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -395,7 +384,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -408,7 +396,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -434,7 +421,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -448,7 +434,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -474,7 +459,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //layer by layer
     #pragma unroll p_Nq
@@ -506,7 +490,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -523,7 +506,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -585,7 +567,6 @@ SOFTWARE.
     }
 
     if (pDisc_c0) { //fill masked boundary points if C0
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -597,7 +578,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // face 1 & 3
       for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -609,7 +589,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // face 2 & 4
       for(int k=0;k<p_Nq;++k;@inner(1)){
diff --git a/solvers/ins/okl/insPressureIncrementRhsQuad2D.okl b/solvers/ins/okl/insPressureIncrementRhsQuad2D.okl
index c4a63821c..8712d048c 100644
--- a/solvers/ins/okl/insPressureIncrementRhsQuad2D.okl
+++ b/solvers/ins/okl/insPressureIncrementRhsQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -88,7 +88,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 2
     for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -99,7 +98,6 @@ SOFTWARE.
       surfaceTerms(sk2,2,i,p_Nq-1);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -110,7 +108,6 @@ SOFTWARE.
       surfaceTerms(sk3,3,0     ,j);
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -133,7 +130,6 @@ SOFTWARE.
     }
 
     // r term ----->
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
@@ -141,7 +137,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
@@ -156,7 +151,6 @@ SOFTWARE.
     }
 
     // s term ---->
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
@@ -164,7 +158,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
@@ -256,7 +249,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over faces to add pseudo-gradient
 
@@ -269,7 +261,6 @@ SOFTWARE.
       surfaceTerms(sk2,2,i,p_Nq-1);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -280,7 +271,6 @@ SOFTWARE.
       surfaceTerms(sk3,3,0     ,j);
     }
 
-    @barrier("local");
 
     // prescale by geofacs
     for(int j=0;j<p_Nq;++j){
@@ -301,7 +291,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
@@ -365,7 +354,6 @@ SOFTWARE.
     }
 
     if (pDisc_c0) { //fill masked boundary points if C0
-      @barrier("local");
       // face 0 & 2
       for(int i=0;i<p_Nq;++i;@inner(0)){
         const dlong sk0 = e*p_Nfp*p_Nfaces + 0*p_Nfp + i;
@@ -375,7 +363,6 @@ SOFTWARE.
         surfaceTerms(sk2,2,i,p_Nq-1);
       }
 
-      @barrier("local");
 
       // face 1 & 3
       for(int j=0;j<p_Nq;++j;@inner(0)){
diff --git a/solvers/ins/okl/insPressureIncrementRhsTet3D.okl b/solvers/ins/okl/insPressureIncrementRhsTet3D.okl
index bb5ee9c05..4c8d820dd 100644
--- a/solvers/ins/okl/insPressureIncrementRhsTet3D.okl
+++ b/solvers/ins/okl/insPressureIncrementRhsTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@ SOFTWARE.
 
 // compute RHS = MM*RHS/gamma + BCdata
 @kernel void insPressureIncrementRhsTet3D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -62,7 +63,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -99,7 +99,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -141,7 +140,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -151,12 +149,10 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
-        const int gid = e*p_Nggeo;
-        const dfloat J = ggeo[gid + p_GWJID];
+        const dfloat J = wJ[e];
 
         dfloat Mp=0.0;
 
@@ -174,6 +170,7 @@ SOFTWARE.
 
 // compute RHS = MM*RHS/gamma + BCdata
 @kernel void insPressureIncrementIpdgRhsTet3D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -253,7 +250,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dpdx += LIFT*(sJ/J)*nx*dp
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -297,7 +293,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -323,7 +318,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -339,7 +333,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -384,7 +377,6 @@ SOFTWARE.
     }
 
     if (pDisc_c0) { //fill masked boundary points if C0
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_NfacesNfp){
@@ -407,4 +399,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/ins/okl/insPressureIncrementRhsTri2D.okl b/solvers/ins/okl/insPressureIncrementRhsTri2D.okl
index d65d9e10b..102b583f6 100644
--- a/solvers/ins/okl/insPressureIncrementRhsTri2D.okl
+++ b/solvers/ins/okl/insPressureIncrementRhsTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@ SOFTWARE.
 
 // compute RHS = MM*RHS/gamma + BCdata
 @kernel void insPressureIncrementRhsTri2D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -62,7 +63,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -99,7 +99,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -130,7 +129,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -140,12 +138,10 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
-        const int gid = e*p_Nggeo;
-        const dfloat J = ggeo[gid + p_GWJID];
+        const dfloat J = wJ[e];
 
         dfloat Mp=0.0;
 
@@ -163,6 +159,7 @@ SOFTWARE.
 
 // compute RHS = MM*RHS/gamma + BCdata
 @kernel void insPressureIncrementIpdgRhsTri2D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -236,7 +233,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dpdx += LIFT*(sJ/J)*nx*dp
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -267,7 +263,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -292,7 +287,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -307,7 +301,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -351,7 +344,6 @@ SOFTWARE.
     }
 
     if (pDisc_c0) { //fill masked boundary points if C0
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_NfacesNfp){
@@ -373,4 +365,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/ins/okl/insPressureRhsHex3D.okl b/solvers/ins/okl/insPressureRhsHex3D.okl
index 2cb789635..a2579693d 100644
--- a/solvers/ins/okl/insPressureRhsHex3D.okl
+++ b/solvers/ins/okl/insPressureRhsHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -45,6 +45,7 @@ SOFTWARE.
 
 // compute RHS = MM*RHS/gamma + BCdata
 @kernel void insPressureRhsHex3D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -98,7 +99,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -113,7 +113,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -125,7 +124,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -147,7 +145,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -159,7 +156,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -181,7 +177,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // Layer by layer
     #pragma unroll p_Nq
@@ -201,7 +196,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -217,7 +211,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -241,7 +234,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -263,8 +255,7 @@ SOFTWARE.
       for(int i=0;i<p_Nq;++i;@inner(0)){
         #pragma unroll p_Nq
         for(int k = 0; k < p_Nq; k++){
-          const dlong gid = i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np*p_Nvgeo;
-          const dfloat JW = vgeo[gid + p_JWID*p_Np];
+          const dfloat JW = wJ[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np];
 
           const dlong id = e*p_Np +k*p_Nq*p_Nq+ j*p_Nq + i;
           RHS[id] = JW*RHS[id]/gamma - r_rhs[k];
@@ -300,6 +291,7 @@ SOFTWARE.
 
 // compute RHS = MM*RHS/gamma + BCdata
 @kernel void insPressureIpdgRhsHex3D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -347,7 +339,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -360,7 +351,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -379,7 +369,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -392,7 +381,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -418,7 +406,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -432,7 +419,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -458,7 +444,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //layer by layer
     #pragma unroll p_Nq
@@ -490,7 +475,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -507,7 +491,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -515,10 +498,9 @@ SOFTWARE.
 
         #pragma unroll p_Nq
         for(int k=0;k<p_Nq;++k){
-          const dlong gid = i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np*p_Nvgeo;
-          const dfloat JW = vgeo[gid + p_JWID*p_Np];
-
           const dlong id = e*p_Np + k*p_Nq*p_Nq + j*p_Nq + i;
+          const dfloat JW = wJ[id];
+
           RHS[id] = JW*RHS[id]/gamma - r_rhsp[k];
         }
       }
@@ -567,7 +549,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -579,7 +560,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
diff --git a/solvers/ins/okl/insPressureRhsQuad2D.okl b/solvers/ins/okl/insPressureRhsQuad2D.okl
index 214799ce9..be275d76d 100644
--- a/solvers/ins/okl/insPressureRhsQuad2D.okl
+++ b/solvers/ins/okl/insPressureRhsQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -43,6 +43,7 @@ SOFTWARE.
 }
 
 @kernel void insPressureRhsQuad2D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -81,7 +82,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 2
     for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -92,7 +92,6 @@ SOFTWARE.
       surfaceTerms(sk2,2,i,p_Nq-1);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -103,7 +102,6 @@ SOFTWARE.
       surfaceTerms(sk3,3,0     ,j);
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -126,7 +124,6 @@ SOFTWARE.
     }
 
     // r term ----->
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
@@ -134,7 +131,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
@@ -149,7 +145,6 @@ SOFTWARE.
     }
 
     // s term ---->
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
@@ -157,12 +152,10 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
-        const dlong gid = i + j*p_Nq + e*p_Np*p_Nvgeo;
-        const dfloat JW = vgeo[gid + p_JWID*p_Np];
+        const dfloat JW = wJ[i + j*p_Nq + e*p_Np];
 
         dfloat tmp = 0.f;
 
@@ -204,6 +197,7 @@ SOFTWARE.
 
 // compute RHS = MM*RHS/gamma + BCdata
 @kernel void insPressureIpdgRhsQuad2D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -242,7 +236,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over faces to add pseudo-gradient
 
@@ -255,7 +248,6 @@ SOFTWARE.
       surfaceTerms(sk2,2,i,p_Nq-1);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -266,7 +258,6 @@ SOFTWARE.
       surfaceTerms(sk3,3,0     ,j);
     }
 
-    @barrier("local");
 
     // prescale by geofacs
     for(int j=0;j<p_Nq;++j){
@@ -287,14 +278,12 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
         // does need the nasty geofacs
-        const dlong gid = i + j*p_Nq + e*p_Np*p_Nvgeo;
-        const dfloat JW = vgeo[gid + p_JWID*p_Np];
+        const dfloat JW = wJ[i + j*p_Nq + e*p_Np];
 
         dfloat dpr = 0, dps = 0;
 
@@ -349,7 +338,6 @@ SOFTWARE.
       surfaceTerms(sk2,2,i,p_Nq-1);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
diff --git a/solvers/ins/okl/insPressureRhsQuad3D.okl b/solvers/ins/okl/insPressureRhsQuad3D.okl
index 50a4106b3..3dc1f6fa3 100644
--- a/solvers/ins/okl/insPressureRhsQuad3D.okl
+++ b/solvers/ins/okl/insPressureRhsQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +26,7 @@ SOFTWARE.
 
 // Computes volume contribution of div(UI)
 @kernel void insPressureRhsQuad3D(const dlong Nelements,
-                                   @restrict const  dfloat *  vgeo,
+                                   @restrict const  dfloat *  wJ,
                                    @restrict const  dfloat *  MM,
                                    const dfloat idt,
                                     @restrict dfloat *  rhs){
@@ -37,11 +37,10 @@ SOFTWARE.
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
         const dlong e = eo+es; // element in block
         if(e<Nelements){
-	  const dlong id = e*p_Np+n;
-	  const dfloat rhsid = rhs[id];
-	  
-          const dlong gid = n + e*p_Np*p_Nvgeo;
-          const dfloat JW = vgeo[gid + p_JWID*p_Np];
+          const dlong id = e*p_Np+n;
+          const dfloat rhsid = rhs[id];
+
+          const dfloat JW = wJ[n + e*p_Np];
 
           rhs[id] = -JW*rhsid*idt;
         }
diff --git a/solvers/ins/okl/insPressureRhsTet3D.okl b/solvers/ins/okl/insPressureRhsTet3D.okl
index 4234b8494..6c0191d88 100644
--- a/solvers/ins/okl/insPressureRhsTet3D.okl
+++ b/solvers/ins/okl/insPressureRhsTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@ SOFTWARE.
 
 // compute RHS = MM*RHS/gamma + BCdata
 @kernel void insPressureRhsTet3D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -61,7 +62,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -91,7 +91,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -133,7 +132,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -143,12 +141,10 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
-        const int gid = e*p_Nggeo;
-        const dfloat J = ggeo[gid + p_GWJID];
+        const dfloat J = wJ[e];
 
         dfloat Mp=0.0;
 
@@ -166,6 +162,7 @@ SOFTWARE.
 
 // compute RHS = MM*RHS/gamma + BCdata
 @kernel void insPressureIpdgRhsTet3D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -237,7 +234,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dpdx += LIFT*(sJ/J)*nx*dp
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -281,7 +277,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -307,7 +302,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -323,11 +317,10 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
-        const dfloat J = vgeo[e*p_Nvgeo + p_JID];
+        const dfloat J = wJ[e];
 
         dfloat Mlaps = 0.f;
 
@@ -375,4 +368,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/ins/okl/insPressureRhsTri2D.okl b/solvers/ins/okl/insPressureRhsTri2D.okl
index ddbe5dfdd..5c57c6e01 100644
--- a/solvers/ins/okl/insPressureRhsTri2D.okl
+++ b/solvers/ins/okl/insPressureRhsTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@ SOFTWARE.
 
 // compute RHS = MM*RHS/gamma + BCdata
 @kernel void insPressureRhsTri2D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -61,7 +62,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -90,7 +90,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -121,7 +120,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -131,12 +129,10 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
-        const int gid = e*p_Nggeo;
-        const dfloat J = ggeo[gid + p_GWJID];
+        const dfloat J = wJ[e];
 
         dfloat Mp=0.0;
 
@@ -154,6 +150,7 @@ SOFTWARE.
 
 // compute RHS = MM*RHS/gamma + BCdata
 @kernel void insPressureIpdgRhsTri2D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -218,7 +215,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // dpdx += LIFT*(sJ/J)*nx*dp
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -249,7 +245,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -274,7 +269,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -289,11 +283,10 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
-        const dfloat J = vgeo[e*p_Nvgeo + p_JID];
+        const dfloat J = wJ[e];
 
         dfloat Mlaps = 0.f;
         // multiply by mass matrix
@@ -339,4 +332,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/ins/okl/insSubcycleAdvection.okl b/solvers/ins/okl/insSubcycleAdvection.okl
index 18eb51749..192bc6eda 100644
--- a/solvers/ins/okl/insSubcycleAdvection.okl
+++ b/solvers/ins/okl/insSubcycleAdvection.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/ins/okl/insSubcycleAdvectionHex3D.okl b/solvers/ins/okl/insSubcycleAdvectionHex3D.okl
index 1567c148c..ae142c593 100644
--- a/solvers/ins/okl/insSubcycleAdvectionHex3D.okl
+++ b/solvers/ins/okl/insSubcycleAdvectionHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -62,7 +62,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_Nq
     for(int k=0;k<p_Nq;++k){
@@ -106,7 +105,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -130,7 +128,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     } //k loop
 
     //write out
@@ -240,7 +237,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -257,7 +253,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -270,7 +265,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -294,7 +288,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -307,7 +300,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -331,7 +323,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/ins/okl/insSubcycleAdvectionQuad2D.okl b/solvers/ins/okl/insSubcycleAdvectionQuad2D.okl
index da0c298b0..17d8d8022 100644
--- a/solvers/ins/okl/insSubcycleAdvectionQuad2D.okl
+++ b/solvers/ins/okl/insSubcycleAdvectionQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -77,7 +77,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -181,7 +180,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -198,7 +196,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -214,7 +211,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insSubcycleAdvectionQuad3D.okl b/solvers/ins/okl/insSubcycleAdvectionQuad3D.okl
index fbdb37381..1c9c6eb72 100644
--- a/solvers/ins/okl/insSubcycleAdvectionQuad3D.okl
+++ b/solvers/ins/okl/insSubcycleAdvectionQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -90,7 +90,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){ 
       for(int j=0;j<p_Nq;++j;@inner(1)){ 
@@ -198,7 +197,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -215,7 +213,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -231,7 +228,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -323,7 +319,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -348,7 +343,6 @@ SOFTWARE.
       }
     }
     
-    @barrier("local");
     
     //write register back to @shared
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -366,7 +360,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in j and store flux in register
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -389,7 +382,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
       for(int j=0;j<p_cubNq;++j;@inner(1)){ 
@@ -419,7 +411,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in j
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -446,7 +437,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -464,7 +454,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");    
 
     //project/differentiate in i and write back 
     for(int es=0;es<p_NblockV;++es;@inner(2)){ 
@@ -575,7 +564,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -618,7 +606,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in j and store flux in register
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -675,7 +662,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in j
     for(int es=0;es<p_NblockV;++es;@inner(2)){   
@@ -720,7 +706,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");    
 
     //project/differentiate in i and write back 
     for(int es=0;es<p_NblockV;++es;@inner(2)){ 
@@ -893,7 +878,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate traces, store flux in register
     for(int es=0;es<p_NblockS;++es;@inner(1)){   
@@ -932,7 +916,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local"); //need a barrier since s_fluxNU and s_fluxNV are aliased
 
     //write fluxes to @shared
     for(int es=0;es<p_NblockS;++es;@inner(1)){   
@@ -974,7 +957,6 @@ SOFTWARE.
       }
     }
     
-    @barrier("local");
     
     // for all face nodes of all elements
     // face 0 & 2
@@ -987,7 +969,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -999,7 +980,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insSubcycleAdvectionTet3D.okl b/solvers/ins/okl/insSubcycleAdvectionTet3D.okl
index aea2d86b5..bab04681d 100644
--- a/solvers/ins/okl/insSubcycleAdvectionTet3D.okl
+++ b/solvers/ins/okl/insSubcycleAdvectionTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -95,7 +95,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -233,7 +232,6 @@ SOFTWARE.
     }
 
     // wait for all flux functions are written to @shared
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insSubcycleAdvectionTri2D.okl b/solvers/ins/okl/insSubcycleAdvectionTri2D.okl
index 1b64a224e..d02a6805a 100644
--- a/solvers/ins/okl/insSubcycleAdvectionTri2D.okl
+++ b/solvers/ins/okl/insSubcycleAdvectionTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -61,7 +61,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -191,7 +190,6 @@ SOFTWARE.
     }
 
     // wait for all flux functions are written to @shared
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insSubcycleCubatureAdvectionHex3D.okl b/solvers/ins/okl/insSubcycleCubatureAdvectionHex3D.okl
index b36f0e175..f6462b3c0 100644
--- a/solvers/ins/okl/insSubcycleCubatureAdvectionHex3D.okl
+++ b/solvers/ins/okl/insSubcycleCubatureAdvectionHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -94,7 +94,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //read in and interpolate in k
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -125,7 +124,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_cubNq
     for(int k=0;k<p_cubNq;++k){
@@ -143,7 +141,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -165,7 +162,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -180,7 +176,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -199,7 +194,6 @@ SOFTWARE.
           }
         }
       }
-      @barrier("local");
     }
 
 
@@ -244,7 +238,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -268,7 +261,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     } //k loop
 
 
@@ -283,7 +275,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -301,7 +292,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -313,7 +303,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -331,7 +320,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     //project in k and write out
@@ -390,7 +378,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -410,7 +397,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -425,7 +411,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -443,7 +428,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -469,7 +453,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -489,7 +472,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -504,7 +486,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -522,7 +503,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -559,7 +539,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -575,7 +554,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -586,7 +564,6 @@ SOFTWARE.
         }                                                               \
       }                                                                 \
     }                                                                   \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -602,7 +579,6 @@ SOFTWARE.
       }                                                                 \
     }                                                                   \
                                                                         \
-    @barrier("local");                                                  \
                                                                         \
     for(int j=0;j<p_cubNq;++j;@inner(1)){                               \
       for(int i=0;i<p_cubNq;++i;@inner(0)){                             \
@@ -683,11 +659,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(0) //face 0
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -700,11 +674,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(5) //face 5
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -717,11 +689,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(1) //face 1
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -737,11 +707,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(3) //face 3
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -757,11 +725,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(2) //face 2
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -777,11 +743,9 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     quadSurfaceTerms(4) //face 4
 
-    @barrier("local");
 
     //accumulate in register pencil
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -797,7 +761,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int j=0;j<p_cubNq;++j;@inner(1)){
       for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -861,7 +824,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //read in and interpolate in k
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -886,7 +848,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_cubNq
     for(int k=0;k<p_cubNq;++k){
@@ -901,7 +862,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -919,7 +879,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -931,7 +890,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -951,7 +909,6 @@ SOFTWARE.
           cU[id+2*p_Np] = r_W[k];
         }
       }
-      @barrier("local");
     }
 
     //read in and interpolate in k
@@ -984,7 +941,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_cubNq
     for(int k=0;k<p_cubNq;++k){
@@ -999,7 +955,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1017,7 +972,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1029,7 +983,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1049,7 +1002,6 @@ SOFTWARE.
           cUd[id+2*p_Np] = r_W[k];
         }
       }
-      @barrier("local");
     }
   }
 
@@ -1086,7 +1038,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_cubNq
     for(int k=0;k<p_cubNq;++k){
@@ -1130,7 +1081,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1154,7 +1104,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     } //k loop
 
     //write out
@@ -1198,7 +1147,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //read in and project in k
     for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -1221,7 +1169,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     #pragma unroll p_Nq
     for(int k=0;k<p_Nq;++k){
@@ -1234,7 +1181,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1252,7 +1198,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1264,7 +1209,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1289,7 +1233,6 @@ SOFTWARE.
           }
         }
       }
-      @barrier("local");
     }
   }
 }
@@ -1356,7 +1299,6 @@ SOFTWARE.
 
     for(int c=0;c<p_Nq;++c){
 
-      @barrier("local");
 
       for(int b=0;b<p_cubNq;++b;@inner(1)){
         for(int a=0;a<p_cubNq;++a;@inner(0)){
@@ -1376,7 +1318,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // interpolate in 'r'
       for(int b=0;b<p_cubNq;++b;@inner(1)){
@@ -1406,7 +1347,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // interpolate in 's'
       for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -1443,7 +1383,6 @@ SOFTWARE.
 
     for(int k=0;k<p_cubNq;++k){
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1455,7 +1394,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1518,7 +1456,6 @@ SOFTWARE.
     // now project back in t
     for(int c=0;c<p_Nq;++c){
 
-      @barrier("local");
 
       for(int j=0;j<p_cubNq;++j;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1538,7 +1475,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int b=0;b<p_cubNq;++b;@inner(1)){
         for(int i=0;i<p_cubNq;++i;@inner(0)){
@@ -1559,7 +1495,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int b=0;b<p_cubNq;++b;@inner(1)){
         for(int a=0;a<p_cubNq;++a;@inner(0)){
diff --git a/solvers/ins/okl/insSubcycleCubatureAdvectionQuad2D.okl b/solvers/ins/okl/insSubcycleCubatureAdvectionQuad2D.okl
index deb72cede..57acf4180 100644
--- a/solvers/ins/okl/insSubcycleCubatureAdvectionQuad2D.okl
+++ b/solvers/ins/okl/insSubcycleCubatureAdvectionQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -81,7 +81,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in i, store in register
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -104,7 +103,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -120,7 +118,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate in j and store flux in register
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -141,7 +138,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
       for(int j=0;j<p_cubNq;++j;@inner(1)){
@@ -167,7 +163,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in j
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -191,7 +186,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //write register back to @shared
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -207,7 +201,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //project/differentiate in i and write back
     for(int es=0;es<p_cubNblockV;++es;@inner(2)){
@@ -361,7 +354,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //interpolate traces, store flux in register
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -393,7 +385,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local"); //need a barrier since s_fluxNU and s_fluxNV are aliased
 
     //write fluxes to @shared
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -433,7 +424,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for all face nodes of all elements
     // face 0 & 2
@@ -446,7 +436,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -458,7 +447,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insSubcycleCubatureAdvectionTet3D.okl b/solvers/ins/okl/insSubcycleCubatureAdvectionTet3D.okl
index 1dcda8f5a..a181c219c 100644
--- a/solvers/ins/okl/insSubcycleCubatureAdvectionTet3D.okl
+++ b/solvers/ins/okl/insSubcycleCubatureAdvectionTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -94,7 +94,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_cubNp;++n;@inner(0)){     // for all nodes in this element
@@ -132,7 +131,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_cubNblockV;++es;@inner(1)){
       for(int n=0;n<p_cubNp;++n;@inner(0)){
@@ -250,7 +248,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
@@ -331,7 +328,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // lift from surface integration to volume nodes
     for(int es=0;es<p_cubNblockS;++es;@inner(1)){
diff --git a/solvers/ins/okl/insSubcycleCubatureAdvectionTri2D.okl b/solvers/ins/okl/insSubcycleCubatureAdvectionTri2D.okl
index c510453b4..57c880f30 100644
--- a/solvers/ins/okl/insSubcycleCubatureAdvectionTri2D.okl
+++ b/solvers/ins/okl/insSubcycleCubatureAdvectionTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -139,7 +139,6 @@ SOFTWARE.
     //#pragma unroll
     for(int io=0;io<p_cubNp;io+=p_Np){ // block the cubature nodes
 
-      @barrier("local");
 
       for(int es=0;es<p_NbV;++es;@inner(1)){// for all elements in block
         for(int ns=0;ns<p_Np;++ns;@inner(0)){     // for all nodes in this element
@@ -181,7 +180,6 @@ SOFTWARE.
       }
 
       // Make sure all node data is loaded into @shared
-      @barrier("local");
 
       for(int es=0;es<p_NbV;++es;@inner(1)){
         for(int n=0;n<p_Np;++n;@inner(0)){
@@ -346,7 +344,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int es=0;es<p_NblockS4;++es;@inner(1)){
@@ -375,7 +372,6 @@ SOFTWARE.
         }
       }
     }
-    @barrier("local");
 
     // for all face nodes of all elements
     for(int es=0;es<p_NblockS4;++es;@inner(1)){
@@ -394,7 +390,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int es=0;es<p_NblockS4;++es;@inner(1)){
@@ -421,7 +416,6 @@ SOFTWARE.
         }
       }
     }
-    @barrier("local");
 
       // Loop for positive traces
     // for all face nodes of all elements
@@ -441,7 +435,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int es=0;es<p_NblockS4;++es;@inner(1)){
@@ -468,7 +461,6 @@ SOFTWARE.
         }
       }
     }
-    @barrier("local");
 
     // for all face nodes of all elements
     for(int es=0;es<p_NblockS4;++es;@inner(1)){
@@ -487,7 +479,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // interpolate to surface integration nodes
     for(int es=0;es<p_NblockS4;++es;@inner(1)){
@@ -514,7 +505,6 @@ SOFTWARE.
         }
       }
     }
-    @barrier("local");
 
     // Use traces to compuite and store flux
     for(int es=0;es<p_NblockS4;++es;@inner(1)){
@@ -555,7 +545,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // lift from surface integration to volume nodes
     for(int es=0;es<p_NblockS4;++es;@inner(1)){
diff --git a/solvers/ins/okl/insVelocityGradientHex3D.okl b/solvers/ins/okl/insVelocityGradientHex3D.okl
index 393abb9b4..7b16c3a8f 100644
--- a/solvers/ins/okl/insVelocityGradientHex3D.okl
+++ b/solvers/ins/okl/insVelocityGradientHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -55,7 +55,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // loop over slabs
       for(int k=0;k<p_Nq;++k){
diff --git a/solvers/ins/okl/insVelocityGradientQuad2D.okl b/solvers/ins/okl/insVelocityGradientQuad2D.okl
index 251937632..e0f846391 100644
--- a/solvers/ins/okl/insVelocityGradientQuad2D.okl
+++ b/solvers/ins/okl/insVelocityGradientQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -49,7 +49,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -81,4 +80,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/ins/okl/insVelocityGradientQuad3D.okl b/solvers/ins/okl/insVelocityGradientQuad3D.okl
index a168ae34f..bf29dd8a5 100644
--- a/solvers/ins/okl/insVelocityGradientQuad3D.okl
+++ b/solvers/ins/okl/insVelocityGradientQuad3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -52,7 +52,6 @@
         }
       }
       
-      @barrier("local");
       
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/ins/okl/insVelocityGradientTet3D.okl b/solvers/ins/okl/insVelocityGradientTet3D.okl
index 7302734be..c63ef0e9b 100644
--- a/solvers/ins/okl/insVelocityGradientTet3D.okl
+++ b/solvers/ins/okl/insVelocityGradientTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -48,7 +48,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
         for(int n=0;n<p_Np;++n;@inner(0)){
diff --git a/solvers/ins/okl/insVelocityGradientTri2D.okl b/solvers/ins/okl/insVelocityGradientTri2D.okl
index aee905710..86492f9dd 100644
--- a/solvers/ins/okl/insVelocityGradientTri2D.okl
+++ b/solvers/ins/okl/insVelocityGradientTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -103,7 +103,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int es=0;es<p_NbV; ++es; @inner(1)){
         for(int n=0;n<p_Np;++n;@inner(0)){
@@ -157,4 +156,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/ins/okl/insVelocityRhsHex3D.okl b/solvers/ins/okl/insVelocityRhsHex3D.okl
index 3430c5791..bbce5e175 100644
--- a/solvers/ins/okl/insVelocityRhsHex3D.okl
+++ b/solvers/ins/okl/insVelocityRhsHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -52,6 +52,7 @@ SOFTWARE.
 // compute RHS = MM*RHS/nu + BCdata
 // and split fields to separate arrays
 @kernel void insVelocityRhsHex3D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -125,7 +126,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -148,7 +148,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -160,7 +159,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -190,7 +188,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -202,7 +199,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -232,7 +228,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // Layer by layer
     #pragma unroll p_Nq
@@ -250,11 +245,10 @@ SOFTWARE.
             r_G12 = ggeo[gbase+p_G12ID*p_Np];
             r_G22 = ggeo[gbase+p_G22ID*p_Np];
 
-            r_GwJ = ggeo[gbase+p_GWJID*p_Np];
+            r_GwJ = wJ[e*p_Np + k*p_Nq*p_Nq + j*p_Nq + i];
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -277,7 +271,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -317,7 +310,6 @@ SOFTWARE.
           }
         }
 
-        @barrier("local");
 
         for(int j=0;j<p_Nq;++j;@inner(1)){
           for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -348,8 +340,7 @@ SOFTWARE.
       for(int i=0;i<p_Nq;++i;@inner(0)){
         #pragma unroll p_Nq
         for(int k = 0; k < p_Nq; k++){
-          const dlong gid = i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np*p_Nvgeo;
-          const dfloat JW = vgeo[gid + p_JWID*p_Np];
+          const dfloat JW = wJ[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np];
 
           const dlong id = e*p_Np +k*p_Nq*p_Nq+ j*p_Nq + i;
           const dlong iid = e*p_Np*p_NVfields + k*p_Nq*p_Nq + j*p_Nq + i;
@@ -408,6 +399,7 @@ SOFTWARE.
 // compute RHS = MM*RHS/nu + BCdata
 // and split fields to separate arrays
 @kernel void insVelocityIpdgRhsHex3D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -480,7 +472,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -490,7 +481,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -511,7 +501,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -521,7 +510,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -542,7 +530,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -552,7 +539,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 1
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -577,7 +563,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -587,7 +572,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 3
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -612,7 +596,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -622,7 +605,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 2
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -647,7 +629,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -657,7 +638,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 4
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -682,7 +662,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     //layer by layer
     #pragma unroll p_Nq
@@ -726,7 +705,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -756,7 +734,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
     }
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -764,8 +741,7 @@ SOFTWARE.
 
         #pragma unroll p_Nq
         for(int k=0;k<p_Nq;++k){
-          const dlong gid = i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np*p_Nvgeo;
-          const dfloat JW = vgeo[gid + p_JWID*p_Np];
+          const dfloat JW = wJ[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np];
 
           const dlong iid = e*p_Np*p_NVfields + k*p_Nq*p_Nq + j*p_Nq + i;
           const dlong id = e*p_Np + k*p_Nq*p_Nq + j*p_Nq + i;
@@ -838,7 +814,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // face 1 & 3
       for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -850,7 +825,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // face 2 & 4
       for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -861,7 +835,6 @@ SOFTWARE.
           surfaceTerms(sk4);
         }
       }
-      @barrier("local");
     }
     // loop over slabs
     for(int j=0;j<p_Nq;++j;@inner(1)){
diff --git a/solvers/ins/okl/insVelocityRhsQuad2D.okl b/solvers/ins/okl/insVelocityRhsQuad2D.okl
index 147cc9d37..941f14df6 100644
--- a/solvers/ins/okl/insVelocityRhsQuad2D.okl
+++ b/solvers/ins/okl/insVelocityRhsQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -49,6 +49,7 @@ SOFTWARE.
 // compute RHS = MM*RHS/nu + BCdata
 // and split fields to separate arrays
 @kernel void insVelocityRhsQuad2D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -99,7 +100,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // face 0 & 2
     for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -110,7 +110,6 @@ SOFTWARE.
       surfaceTerms(sk2,2,i,p_Nq-1);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -121,7 +120,6 @@ SOFTWARE.
       surfaceTerms(sk3,3,0     ,j);
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -131,7 +129,7 @@ SOFTWARE.
         r_G00[j] = ggeo[base+p_G00ID*p_Np];
         r_G01[j] = ggeo[base+p_G01ID*p_Np];
         r_G11[j] = ggeo[base+p_G11ID*p_Np];
-        r_GwJ[j] = ggeo[base+p_GWJID*p_Np];
+        r_GwJ[j] = wJ[e*p_Np + j*p_Nq + i];
 
         dfloat ur = 0.f, us = 0.f;
         dfloat vr = 0.f, vs = 0.f;
@@ -152,7 +150,6 @@ SOFTWARE.
     }
 
     // r term ----->
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
@@ -161,7 +158,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
@@ -179,7 +175,6 @@ SOFTWARE.
     }
 
     // s term ---->
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
@@ -188,12 +183,10 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int i=0;i<p_Nq;++i;@inner(0)){
       for(int j=0;j<p_Nq;++j){
-        const dlong gid = i + j*p_Nq + e*p_Np*p_Nvgeo;
-        const dfloat JW = vgeo[gid + p_JWID*p_Np];
+        const dfloat JW = wJ[i + j*p_Nq + e*p_Np];
 
         dfloat tmpu = 0.f;
         dfloat tmpv = 0.f;
@@ -252,6 +245,7 @@ SOFTWARE.
 // compute RHS = MM*RHS/nu + BCdata
 // and split fields to separate arrays
 @kernel void insVelocityIpdgRhsQuad2D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -303,7 +297,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over faces to add pseudo-gradient
 
@@ -316,7 +309,6 @@ SOFTWARE.
       surfaceTerms(sk2,2,i,p_Nq-1);
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -327,7 +319,6 @@ SOFTWARE.
       surfaceTerms(sk3,3,0     ,j);
     }
 
-    @barrier("local");
 
     // prescale by geofacs
     for(int j=0;j<p_Nq;++j){
@@ -352,14 +343,11 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     // loop over slabs
     for(int j=0;j<p_Nq;++j){
       for(int i=0;i<p_Nq;++i;@inner(0)){
-        // does need the nasty geofacs
-        const dlong gid = i + j*p_Nq + e*p_Np*p_Nvgeo;
-        const dfloat JW = vgeo[gid + p_JWID*p_Np];
+        const dfloat JW = wJ[i + j*p_Nq + e*p_Np];
 
         dfloat dur = 0, dus = 0;
         dfloat dvr = 0, dvs = 0;
@@ -433,7 +421,6 @@ SOFTWARE.
         surfaceTerms(sk2);
       }
 
-      @barrier("local");
 
       // face 1 & 3
       for(int j=0;j<p_Nq;++j;@inner(0)){
@@ -443,7 +430,6 @@ SOFTWARE.
         surfaceTerms(sk1);
         surfaceTerms(sk3);
       }
-      @barrier("local");
     }
 
     // loop over slabs
@@ -456,4 +442,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/ins/okl/insVelocityRhsQuad3D.okl b/solvers/ins/okl/insVelocityRhsQuad3D.okl
index a63af0c95..5860447d2 100644
--- a/solvers/ins/okl/insVelocityRhsQuad3D.okl
+++ b/solvers/ins/okl/insVelocityRhsQuad3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -26,28 +26,28 @@
 
 // Compute RHS Forcing i.e. 1/nu*(-grad(Pr) + sum(a_i x u_i)^(n-i) -sum(b_i x N(u)^(n-i))
 @kernel void insVelocityRhsARKQuad3D(const dlong Nelements,
-				     const int stage,  
-				     @restrict const  dfloat *  vgeo,
-				     @restrict const  dfloat *  MM,
-				     const dfloat idt,
-				     const dfloat inu,
-				     @restrict const  dfloat *  erkA,
-				     @restrict const  dfloat *  irkA,
-				     @restrict const  dfloat *  prkA,
-				     @restrict const  dfloat *  prkB,
-				     const dlong fieldOffset,
-				     @restrict const  dfloat *  U,
-				     @restrict const  dfloat *  NU,
-				     @restrict const  dfloat *  LU,
-				     @restrict const  dfloat *  GP,
-				     @restrict dfloat *  rhsU,
-				     @restrict dfloat *  rhsV,
-				     @restrict dfloat *  rhsW){
+                                     const int stage,
+                                     @restrict const  dfloat *  vgeo,
+                                     @restrict const  dfloat *  MM,
+                                     const dfloat idt,
+                                     const dfloat inu,
+                                     @restrict const  dfloat *  erkA,
+                                     @restrict const  dfloat *  irkA,
+                                     @restrict const  dfloat *  prkA,
+                                     @restrict const  dfloat *  prkB,
+                                     const dlong fieldOffset,
+                                     @restrict const  dfloat *  U,
+                                     @restrict const  dfloat *  NU,
+                                     @restrict const  dfloat *  LU,
+                                     @restrict const  dfloat *  GP,
+                                     @restrict dfloat *  rhsU,
+                                     @restrict dfloat *  rhsV,
+                                     @restrict dfloat *  rhsW){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){// for all elements in block
-      for(int j=0;j<p_Nq;++j;@inner(1)){ 
+      for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
           const dlong e = eo+es; // element in block
           if(e<Nelements){
@@ -91,20 +91,20 @@
 
 // rhsU^s = MM*(U^n - \sum^s-1 ea_si N(U^i) + \sum^s-1 ia_si LU^i - \sum^s-1 pa_si GP^i)/ia_ss nu dt
 @kernel void insVelocityRhsEXTBDFQuad3D(const dlong Nelements,
-					@restrict const  dfloat *  vgeo,
-					@restrict const  dfloat *  MM,
-					const dfloat idt,
-					const dfloat inu,
-					@restrict const  dfloat *  extbdfA,
-					@restrict const  dfloat *  extbdfB,
-					@restrict const  dfloat *  extbdfC,
-					const dlong fieldOffset,
-					@restrict const  dfloat *  U,
-					@restrict const  dfloat *  NU,
-					@restrict const  dfloat *  GP,
-					@restrict dfloat *  rhsU,
-					@restrict dfloat *  rhsV,
-					@restrict dfloat *  rhsW){
+                                        @restrict const  dfloat *  vgeo,
+                                        @restrict const  dfloat *  MM,
+                                        const dfloat idt,
+                                        const dfloat inu,
+                                        @restrict const  dfloat *  extbdfA,
+                                        @restrict const  dfloat *  extbdfB,
+                                        @restrict const  dfloat *  extbdfC,
+                                        const dlong fieldOffset,
+                                        @restrict const  dfloat *  U,
+                                        @restrict const  dfloat *  NU,
+                                        @restrict const  dfloat *  GP,
+                                        @restrict dfloat *  rhsU,
+                                        @restrict dfloat *  rhsV,
+                                        @restrict dfloat *  rhsW){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){
 
diff --git a/solvers/ins/okl/insVelocityRhsTet3D.okl b/solvers/ins/okl/insVelocityRhsTet3D.okl
index d8010e3a8..ad5393f68 100644
--- a/solvers/ins/okl/insVelocityRhsTet3D.okl
+++ b/solvers/ins/okl/insVelocityRhsTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,6 +27,7 @@ SOFTWARE.
 // compute RHS = MM*RHS/nu + BCdata
 // and split fields to separate arrays
 @kernel void insVelocityRhsTet3D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -77,7 +78,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -113,7 +113,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -173,7 +172,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -185,12 +183,10 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
-        const int gid = e*p_Nggeo;
-        const dfloat J = ggeo[gid + p_GWJID];
+        const dfloat J = wJ[e]; // wJ is indexed by e alone (no per-node or per-factor offset)
 
         dfloat Mu=0.0, Mv=0.0, Mw=0.0;
 
@@ -219,6 +215,7 @@ SOFTWARE.
 // compute RHS = MM*RHS/nu + BCdata
 // and split fields to separate arrays
 @kernel void insVelocityIpdgRhsTet3D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -307,7 +304,6 @@ SOFTWARE.
     }
 
     for (int fld=0;fld<p_NVfields;fld++) {
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_NfacesNfp){
@@ -317,7 +313,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // dqdx += LIFT*(sJ/J)*nx*dq
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -362,7 +357,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_NfacesNfp){
@@ -388,7 +382,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_Np){
@@ -404,11 +397,10 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_Np){
-          const dfloat J = vgeo[e*p_Nvgeo + p_JID];
+          const dfloat J = wJ[e]; // wJ is indexed by e alone (no per-node or per-factor offset)
 
           dfloat Mlaps = 0.f;
 
@@ -485,7 +477,6 @@ SOFTWARE.
           }
         }
       }
-      @barrier("local");
     }
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
diff --git a/solvers/ins/okl/insVelocityRhsTri2D.okl b/solvers/ins/okl/insVelocityRhsTri2D.okl
index d44e4465a..83855efdb 100644
--- a/solvers/ins/okl/insVelocityRhsTri2D.okl
+++ b/solvers/ins/okl/insVelocityRhsTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,6 +27,7 @@ SOFTWARE.
 // compute RHS = MM*RHS/nu + BCdata
 // and split fields to separate arrays
 @kernel void insVelocityRhsTri2D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -73,7 +74,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_NfacesNfp){
@@ -105,7 +105,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -143,7 +142,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
@@ -154,12 +152,10 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
       if(n<p_Np){
-        const int gid = e*p_Nggeo;
-        const dfloat J = ggeo[gid + p_GWJID];
+        const dfloat J = wJ[e]; // wJ is indexed by e alone (no per-node or per-factor offset)
 
         dfloat Mu=0.0, Mv=0.0;
 
@@ -184,6 +180,7 @@ SOFTWARE.
 // compute RHS = MM*RHS/nu + BCdata
 // and split fields to separate arrays
 @kernel void insVelocityIpdgRhsTri2D(const dlong Nelements,
+                               @restrict const  dfloat *  wJ,
                                @restrict const  dfloat *  vgeo,
                                @restrict const  dfloat *  sgeo,
                                @restrict const  dfloat *  ggeo,
@@ -263,7 +260,6 @@ SOFTWARE.
     }
 
     for (int fld=0;fld<p_NVfields;fld++) {
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_NfacesNfp){
@@ -272,7 +268,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       // dqdx += LIFT*(sJ/J)*nx*dq
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
@@ -303,7 +298,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_NfacesNfp){
@@ -328,7 +322,6 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_Np){
@@ -343,11 +336,10 @@ SOFTWARE.
         }
       }
 
-      @barrier("local");
 
       for(int n=0;n<p_maxNodes;++n;@inner(0)){
         if(n<p_Np){
-          const dfloat J = vgeo[e*p_Nvgeo + p_JID];
+          const dfloat J = wJ[e]; // wJ is indexed by e alone; J scales the mass-matrix product below
 
           dfloat Mlaps = 0.f;
           // multiply by mass matrix
@@ -415,7 +407,6 @@ SOFTWARE.
           }
         }
       }
-      @barrier("local");
     }
 
     for(int n=0;n<p_maxNodes;++n;@inner(0)){
diff --git a/solvers/ins/okl/insVorticityHex3D.okl b/solvers/ins/okl/insVorticityHex3D.okl
index e8e259473..5b61b252d 100644
--- a/solvers/ins/okl/insVorticityHex3D.okl
+++ b/solvers/ins/okl/insVorticityHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -55,7 +55,6 @@ SOFTWARE.
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -181,7 +180,6 @@ if (p1 < TOL){
     }
 
     // Make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -289,4 +287,4 @@ if (p1 < TOL){
     }
   }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/solvers/ins/okl/insVorticityQuad2D.okl b/solvers/ins/okl/insVorticityQuad2D.okl
index fc9d47bb6..3b99b18f9 100644
--- a/solvers/ins/okl/insVorticityQuad2D.okl
+++ b/solvers/ins/okl/insVorticityQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -54,7 +54,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -92,4 +91,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/ins/okl/insVorticityQuad3D.okl b/solvers/ins/okl/insVorticityQuad3D.okl
index 909b7e68e..26d329798 100644
--- a/solvers/ins/okl/insVorticityQuad3D.okl
+++ b/solvers/ins/okl/insVorticityQuad3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -56,7 +56,6 @@ SOFTWARE.
       }
     }
           
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){ 
       for(int j=0;j<p_Nq;++j;@inner(1)){ 
diff --git a/solvers/ins/okl/insVorticityTet3D.okl b/solvers/ins/okl/insVorticityTet3D.okl
index 0ddab5d4f..d2d9ebfa2 100644
--- a/solvers/ins/okl/insVorticityTet3D.okl
+++ b/solvers/ins/okl/insVorticityTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -48,7 +48,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(int e=eo;e<eo+p_NblockV;++e;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
@@ -174,7 +173,6 @@ void eigenValue(const dfloat a11, const dfloat a12, const dfloat a13, const dflo
       }
     }
 
-    @barrier("local");
 
     for(int e=eo;e<eo+p_NblockV;++e;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
diff --git a/solvers/ins/okl/insVorticityTri2D.okl b/solvers/ins/okl/insVorticityTri2D.okl
index abe93c34a..3a30e1cfa 100644
--- a/solvers/ins/okl/insVorticityTri2D.okl
+++ b/solvers/ins/okl/insVorticityTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -46,7 +46,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
 
     for(dlong e=eo;e<eo+p_NblockV;++e;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
diff --git a/solvers/ins/src/insAdvection.cpp b/solvers/ins/src/insAdvection.cpp
index 52b654417..d684a25dc 100644
--- a/solvers/ins/src/insAdvection.cpp
+++ b/solvers/ins/src/insAdvection.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,11 @@ SOFTWARE.
 #include "ins.hpp"
 
 // compute RHS = beta*RHS + alpha*N(U)
-void ins_t::Advection(const dfloat alpha, occa::memory& o_U,
-                      const dfloat beta,  occa::memory& o_RHS,
+void ins_t::Advection(const dfloat alpha, deviceMemory<dfloat>& o_U,
+                      const dfloat beta,  deviceMemory<dfloat>& o_RHS,
                       const dfloat T) {
 
-  vTraceHalo->ExchangeStart(o_U, 1, ogs_dfloat);
+  vTraceHalo.ExchangeStart(o_U, 1); // started before the volume kernel; ExchangeFinish is called ahead of the surface kernel
 
   if (cubature)
     advectionVolumeKernel(mesh.Nelements,
@@ -54,7 +54,7 @@ void ins_t::Advection(const dfloat alpha, occa::memory& o_U,
                          o_U,
                          o_RHS);
 
-  vTraceHalo->ExchangeFinish(o_U, 1, ogs_dfloat);
+  vTraceHalo.ExchangeFinish(o_U, 1);
 
   if (cubature)
     advectionSurfaceKernel(mesh.Nelements,
diff --git a/solvers/ins/src/insDiffusion.cpp b/solvers/ins/src/insDiffusion.cpp
index 49f355cca..c8813d9b9 100644
--- a/solvers/ins/src/insDiffusion.cpp
+++ b/solvers/ins/src/insDiffusion.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,8 +27,8 @@ SOFTWARE.
 #include "ins.hpp"
 
 // compute RHS = beta*RHS + alpha*L(U)
-void ins_t::Diffusion(const dfloat alpha, occa::memory& o_U,
-                      const dfloat beta,  occa::memory& o_RHS,
+void ins_t::Diffusion(const dfloat alpha, deviceMemory<dfloat>& o_U,
+                      const dfloat beta,  deviceMemory<dfloat>& o_RHS,
                       const dfloat T) {
 
   //IPDG
@@ -39,7 +39,7 @@ void ins_t::Diffusion(const dfloat alpha, occa::memory& o_U,
                         o_GU);
 
   // dfloat4 storage -> 4 entries
-  vTraceHalo->ExchangeStart(o_GU, 4, ogs_dfloat);
+  vTraceHalo.ExchangeStart(o_GU, 4);
 
   if(mesh.NinternalElements)
     diffusionKernel(mesh.NinternalElements,
@@ -62,7 +62,7 @@ void ins_t::Diffusion(const dfloat alpha, occa::memory& o_U,
                    o_GU,
                    o_RHS);
 
-  vTraceHalo->ExchangeFinish(o_GU, 4, ogs_dfloat);
+  vTraceHalo.ExchangeFinish(o_GU, 4);
 
   if(mesh.NhaloElements)
     diffusionKernel(mesh.NhaloElements,
@@ -84,4 +84,4 @@ void ins_t::Diffusion(const dfloat alpha, occa::memory& o_U,
                     beta,
                     o_GU,
                     o_RHS);
-}
\ No newline at end of file
+}
diff --git a/solvers/ins/src/insDivergence.cpp b/solvers/ins/src/insDivergence.cpp
index e5366c9c7..12d838802 100644
--- a/solvers/ins/src/insDivergence.cpp
+++ b/solvers/ins/src/insDivergence.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,11 @@ SOFTWARE.
 #include "ins.hpp"
 
 // compute RHS = beta*RHS + alpha*div U
-void ins_t::Divergence(const dfloat alpha, occa::memory& o_U,
-                       const dfloat beta,  occa::memory& o_RHS,
+void ins_t::Divergence(const dfloat alpha, deviceMemory<dfloat>& o_U,
+                       const dfloat beta,  deviceMemory<dfloat>& o_RHS,
                        const dfloat T){
 
-  vTraceHalo->ExchangeStart(o_U, 1, ogs_dfloat);
+  vTraceHalo.ExchangeStart(o_U, 1); // started before the volume kernel; ExchangeFinish is called ahead of the surface kernel
 
   // computes div u^(n+1) volume term
   divergenceVolumeKernel(mesh.Nelements,
@@ -42,7 +42,7 @@ void ins_t::Divergence(const dfloat alpha, occa::memory& o_U,
                          o_U,
                          o_RHS);
 
-  vTraceHalo->ExchangeFinish(o_U, 1, ogs_dfloat);
+  vTraceHalo.ExchangeFinish(o_U, 1);
 
   divergenceSurfaceKernel(mesh.Nelements,
                            mesh.o_sgeo,
diff --git a/solvers/ins/src/insGradient.cpp b/solvers/ins/src/insGradient.cpp
index 88577f68f..36b2019bc 100644
--- a/solvers/ins/src/insGradient.cpp
+++ b/solvers/ins/src/insGradient.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,11 @@ SOFTWARE.
 #include "ins.hpp"
 
 // compute RHS = beta*RHS + alpha*grad P
-void ins_t::Gradient(const dfloat alpha, occa::memory& o_P,
-                     const dfloat beta,  occa::memory& o_RHS,
+void ins_t::Gradient(const dfloat alpha, deviceMemory<dfloat>& o_P,
+                     const dfloat beta,  deviceMemory<dfloat>& o_RHS,
                      const dfloat T){
 
-  pTraceHalo->ExchangeStart(o_P, 1, ogs_dfloat);
+  pTraceHalo.ExchangeStart(o_P, 1); // started before the volume kernel; ExchangeFinish is called ahead of the surface kernel
 
   // Compute Volume Contribution
   gradientVolumeKernel(mesh.Nelements,
@@ -42,7 +42,7 @@ void ins_t::Gradient(const dfloat alpha, occa::memory& o_P,
                       o_P,
                       o_RHS);
 
-  pTraceHalo->ExchangeFinish(o_P, 1, ogs_dfloat);
+  pTraceHalo.ExchangeFinish(o_P, 1);
 
   // Compute Surface Conribution
   gradientSurfaceKernel(mesh.Nelements,
diff --git a/solvers/ins/src/insPlotFields.cpp b/solvers/ins/src/insPlotFields.cpp
index 1b9f375f1..e65e037d4 100644
--- a/solvers/ins/src/insPlotFields.cpp
+++ b/solvers/ins/src/insPlotFields.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,11 @@ SOFTWARE.
 #include "ins.hpp"
 
 // interpolate data to plot nodes and save to file (one per process
-void ins_t::PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName){
+void ins_t::PlotFields(memory<dfloat>& U, memory<dfloat>& P, memory<dfloat>& V, std::string fileName){
 
   FILE *fp;
 
-  fp = fopen(fileName, "w");
+  fp = fopen(fileName.c_str(), "w");
 
   fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
   fprintf(fp, "  <UnstructuredGrid>\n");
@@ -44,36 +44,42 @@ void ins_t::PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName){
   fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
 
   //scratch space for interpolation
-  size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat);
-  dfloat* scratch = (dfloat *) malloc(2*NscratchBytes);
+  size_t Nscratch = std::max(mesh.Np, mesh.plotNp); // larger of the two node counts, in entries (not bytes)
+  memory<dfloat> scratch(2*Nscratch); // work space handed to mesh.PlotInterp below
 
-  dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ix(mesh.plotNp);
+  memory<dfloat> Iy(mesh.plotNp);
+  memory<dfloat> Iz(mesh.plotNp);
 
   // compute plot node coordinates on the fly
   for(dlong e=0;e<mesh.Nelements;++e){
     mesh.PlotInterp(mesh.x + e*mesh.Np, Ix, scratch);
     mesh.PlotInterp(mesh.y + e*mesh.Np, Iy, scratch);
-    mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
+    if(mesh.dim==3) // z coordinates are interpolated for 3D meshes only
+      mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
 
-    for(int n=0;n<mesh.plotNp;++n){
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+    if (mesh.dim==2) { // 2D: Iz was not filled above, so a literal 0.0 is written as the third coordinate
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],0.0);
+      }
+    } else { // any other dim: write the interpolated z from Iz
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+      }
     }
   }
   fprintf(fp, "        </DataArray>\n");
   fprintf(fp, "      </Points>\n");
 
-  free(Ix); free(Iy); free(Iz);
-
-  dfloat* Ip = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ip(mesh.plotNp);
+  memory<dfloat> Iu(mesh.plotNp);
+  memory<dfloat> Iv(mesh.plotNp);
+  memory<dfloat> Iw(mesh.plotNp);
 
   fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
-  if (U!=nullptr) {
+  if (U.length()!=0) { // a zero-length U skips the velocity array entirely
     // write out velocity
     fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Velocity\" NumberOfComponents=\"%d\" Format=\"ascii\">\n", mesh.dim);
     for(dlong e=0;e<mesh.Nelements;++e){
@@ -94,7 +100,7 @@ void ins_t::PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName){
     fprintf(fp, "       </DataArray>\n");
   }
 
-  if (P!=nullptr) {
+  if (P.length()!=0) { // a zero-length P skips the pressure array entirely
     // write out pressure
     fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Pressure\" Format=\"ascii\">\n");
     for(dlong e=0;e<mesh.Nelements;++e){
@@ -108,7 +114,7 @@ void ins_t::PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName){
     fprintf(fp, "       </DataArray>\n");
   }
 
-  if (V!=nullptr) {
+  if (V.length()!=0) { // a zero-length V skips the vorticity array entirely
     // write out vorticity
     if(mesh.dim==2){
       fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Vorticity\" Format=\"ascii\">\n");
@@ -138,8 +144,6 @@ void ins_t::PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName){
   }
   fprintf(fp, "     </PointData>\n");
 
-  free(Ip); free(Iu); free(Iv); free(Iw);
-
   fprintf(fp, "    <Cells>\n");
   fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
 
@@ -180,6 +184,4 @@ void ins_t::PlotFields(dfloat* U, dfloat* P, dfloat *V, char *fileName){
   fprintf(fp, "  </UnstructuredGrid>\n");
   fprintf(fp, "</VTKFile>\n");
   fclose(fp);
-
-  free(scratch);
 }
diff --git a/solvers/ins/src/insPressureIncrementSolve.cpp b/solvers/ins/src/insPressureIncrementSolve.cpp
index 7c69ac544..fad714c62 100644
--- a/solvers/ins/src/insPressureIncrementSolve.cpp
+++ b/solvers/ins/src/insPressureIncrementSolve.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,11 +28,12 @@ SOFTWARE.
 
 //  Solves -gamma*Laplacian*PI = rhs
 //  P += PI
-void ins_t::PressureIncrementSolve(occa::memory& o_P, occa::memory& o_RHS,
+void ins_t::PressureIncrementSolve(deviceMemory<dfloat>& o_P, deviceMemory<dfloat>& o_RHS,
                                    const dfloat gamma, const dfloat T, const dfloat dt){
 
   // compute RHS = MM*RHS/gamma + BCdata
   pressureIncrementRhsKernel(mesh.Nelements,
+                    mesh.o_wJ,
                     mesh.o_vgeo,
                     mesh.o_sgeo,
                     mesh.o_ggeo,
@@ -43,7 +44,7 @@ void ins_t::PressureIncrementSolve(occa::memory& o_P, occa::memory& o_RHS,
                     mesh.o_sM,
                     mesh.o_vmapM,
                     mesh.o_EToB,
-                    o_mapB,
+                    mesh.o_mapB,
                     pTau,
                     T,
                     dt,
@@ -60,18 +61,18 @@ void ins_t::PressureIncrementSolve(occa::memory& o_P, occa::memory& o_RHS,
   //  Solve - Laplacian*PI = RHS
   if(pDisc_c0) {
     // gather, solve, scatter
-    pSolver->ogsMasked->Gather(o_GrhsP, o_RHS, ogs_dfloat, ogs_add, ogs_trans);
-    NiterP = pSolver->Solve(*pLinearSolver, o_GPI, o_GrhsP, presTOL, maxIter, verbose);
-    pSolver->ogsMasked->Scatter(o_PI, o_GPI, ogs_dfloat, ogs_add, ogs_notrans);
+    pSolver.ogsMasked.Gather(o_GrhsP, o_RHS, 1, ogs::Add, ogs::Trans); // o_GrhsP is consumed by Solve on the next line
+    NiterP = pSolver.Solve(pLinearSolver, o_GPI, o_GrhsP, presTOL, maxIter, verbose); // return value kept in NiterP (presumably the iteration count)
+    pSolver.ogsMasked.Scatter(o_PI, o_GPI, 1, ogs::NoTrans); // o_PI is what the P += PI kernel below uses
   } else {
-    NiterP = pSolver->Solve(*pLinearSolver, o_PI, o_RHS, presTOL, maxIter, verbose);
+    NiterP = pSolver.Solve(pLinearSolver, o_PI, o_RHS, presTOL, maxIter, verbose);
   }
 
   // P += PI and enter BCs if C0
   pressureIncrementBCKernel(mesh.Nelements,
                    mesh.o_sgeo,
                    mesh.o_vmapM,
-                   o_mapB,
+                   mesh.o_mapB,
                    T,
                    mesh.o_x,
                    mesh.o_y,
diff --git a/solvers/ins/src/insPressureSolve.cpp b/solvers/ins/src/insPressureSolve.cpp
index 36d34cfe8..894e590c1 100644
--- a/solvers/ins/src/insPressureSolve.cpp
+++ b/solvers/ins/src/insPressureSolve.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,12 @@ SOFTWARE.
 #include "ins.hpp"
 
 //  Solves -gamma*Laplacian*P = rhs
-void ins_t::PressureSolve(occa::memory& o_P, occa::memory& o_RHS,
+void ins_t::PressureSolve(deviceMemory<dfloat>& o_P, deviceMemory<dfloat>& o_RHS,
                           const dfloat gamma, const dfloat T){
 
   // compute RHS = MM*RHS/gamma + BCdata
   pressureRhsKernel(mesh.Nelements,
+                    mesh.o_wJ,
                     mesh.o_vgeo,
                     mesh.o_sgeo,
                     mesh.o_ggeo,
@@ -42,7 +43,7 @@ void ins_t::PressureSolve(occa::memory& o_P, occa::memory& o_RHS,
                     mesh.o_sM,
                     mesh.o_vmapM,
                     mesh.o_EToB,
-                    o_mapB,
+                    mesh.o_mapB,
                     pTau,
                     T,
                     mesh.o_x,
@@ -58,15 +59,15 @@ void ins_t::PressureSolve(occa::memory& o_P, occa::memory& o_RHS,
 
   if(pDisc_c0) {
     // gather, solve, scatter
-    pSolver->ogsMasked->Gather(o_GrhsP, o_RHS, ogs_dfloat, ogs_add, ogs_trans);
-    NiterP = pSolver->Solve(*pLinearSolver, o_GP, o_GrhsP, presTOL, maxIter, verbose);
-    pSolver->ogsMasked->Scatter(o_P, o_GP, ogs_dfloat, ogs_add, ogs_notrans);
+    pSolver.ogsMasked.Gather(o_GrhsP, o_RHS, 1, ogs::Add, ogs::Trans); // o_GrhsP is consumed by Solve on the next line
+    NiterP = pSolver.Solve(pLinearSolver, o_GP, o_GrhsP, presTOL, maxIter, verbose); // return value kept in NiterP (presumably the iteration count)
+    pSolver.ogsMasked.Scatter(o_P, o_GP, 1, ogs::NoTrans); // o_P is then passed to pressureBCKernel below
 
     // enter BCs if C0
     pressureBCKernel(mesh.Nelements,
                      mesh.o_sgeo,
                      mesh.o_vmapM,
-                     o_mapB,
+                     mesh.o_mapB,
                      T,
                      mesh.o_x,
                      mesh.o_y,
@@ -74,6 +75,6 @@ void ins_t::PressureSolve(occa::memory& o_P, occa::memory& o_RHS,
                      nu,
                      o_P);
   } else {
-    NiterP = pSolver->Solve(*pLinearSolver, o_P, o_RHS, presTOL, maxIter, verbose);
+    NiterP = pSolver.Solve(pLinearSolver, o_P, o_RHS, presTOL, maxIter, verbose);
   }
 }
diff --git a/solvers/ins/src/insReport.cpp b/solvers/ins/src/insReport.cpp
index 03e4c77f9..4a9fa1b36 100644
--- a/solvers/ins/src/insReport.cpp
+++ b/solvers/ins/src/insReport.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -34,7 +34,7 @@ void ins_t::Report(dfloat time, int tstep){
   mesh.MassMatrixApply(o_u, o_MU);
 
   dlong Nentries = mesh.Nelements*mesh.Np*NVfields;
-  dfloat norm2 = sqrt(linAlg.innerProd(Nentries, o_u, o_MU, mesh.comm));
+  dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_u, o_MU, mesh.comm));
 
   if(mesh.rank==0)
     printf("\n%5.2f (%d), %5.2f (time, timestep, norm)\n", time, tstep, norm2);
@@ -49,11 +49,11 @@ void ins_t::Report(dfloat time, int tstep){
     o_Vort.copyTo(Vort);
 
     // output field files
-    string name;
+    std::string name;
     settings.getSetting("OUTPUT FILE NAME", name);
     char fname[BUFSIZ];
     sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++);
 
-    PlotFields(u, p, Vort, fname);
+    PlotFields(u, p, Vort, std::string(fname));
   }
 }
diff --git a/solvers/ins/src/insRun.cpp b/solvers/ins/src/insRun.cpp
index 4b2dca40b..9b328508a 100644
--- a/solvers/ins/src/insRun.cpp
+++ b/solvers/ins/src/insRun.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -56,14 +56,14 @@ void ins_t::Run(){
     dt = dtAdvc;
   } else if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")) {
     dt = Nsubcycles*dtAdvc;
-    subStepper->SetTimeStep(dtAdvc);
+    subStepper.SetTimeStep(dtAdvc);
   } else {
-    dt = mymin(dtAdvc, dtDiff);
+    dt = std::min(dtAdvc, dtDiff); // remaining integrators: take the smaller of dtAdvc and dtDiff
   }
 
-  timeStepper->SetTimeStep(dt);
+  timeStepper.SetTimeStep(dt);
 
-  timeStepper->Run(o_u, startTime, finalTime);
+  timeStepper.Run(*this, o_u, startTime, finalTime);
 
   // output norm of final solution
   {
@@ -71,7 +71,7 @@ void ins_t::Run(){
     mesh.MassMatrixApply(o_u, o_MU);
 
     dlong Nentries = mesh.Nelements*mesh.Np*NVfields;
-    dfloat norm2 = sqrt(linAlg.innerProd(Nentries, o_u, o_MU, mesh.comm));
+    dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_u, o_MU, mesh.comm));
 
     if(mesh.rank==0)
       printf("Solution norm = %17.15lg\n", norm2);
diff --git a/solvers/ins/src/insSettings.cpp b/solvers/ins/src/insSettings.cpp
index 9010f13a6..ba63cdd2b 100644
--- a/solvers/ins/src/insSettings.cpp
+++ b/solvers/ins/src/insSettings.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ SOFTWARE.
 #include "ins.hpp"
 
 //settings for ins solver
-insSettings_t::insSettings_t(MPI_Comm& _comm):
+insSettings_t::insSettings_t(comm_t& _comm):
   settings_t(_comm) {
 
   newSetting("DATA FILE",
@@ -88,19 +88,16 @@ insSettings_t::insSettings_t(MPI_Comm& _comm):
 
   ellipticAddSettings(*this, "VELOCITY ");
   parAlmond::AddSettings(*this, "VELOCITY ");
-  initialGuessAddSettings(*this, "VELOCITY ");
+  InitialGuess::AddSettings(*this, "VELOCITY ");
 
   ellipticAddSettings(*this, "PRESSURE ");
   parAlmond::AddSettings(*this, "PRESSURE ");
-  initialGuessAddSettings(*this, "PRESSURE ");
+  InitialGuess::AddSettings(*this, "PRESSURE ");
 }
 
 void insSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) { // the settings report is printed by rank 0 only
     std::cout << "INS Settings:\n\n";
     reportSetting("DATA FILE");
     reportSetting("VISCOSITY");
@@ -167,15 +164,15 @@ void insSettings_t::report() {
 
 void insSettings_t::parseFromFile(platformSettings_t& platformSettings,
                                   meshSettings_t& meshSettings,
-                                  const string filename) {
+                                  const std::string filename) {
   //read all settings from file
   settings_t s(comm);
   s.readSettingsFromFile(filename);
 
   for(auto it = s.settings.begin(); it != s.settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
-    const string val = set->getVal<string>();
+    setting_t& set = it->second;
+    const std::string name = set.getName();
+    const std::string val = set.getVal<std::string>();
     if (platformSettings.hasSetting(name))
       platformSettings.changeSetting(name, val);
     else if (meshSettings.hasSetting(name))
@@ -183,46 +180,44 @@ void insSettings_t::parseFromFile(platformSettings_t& platformSettings,
     else if (hasSetting(name)) //self
       changeSetting(name, val);
     else  {
-      stringstream ss;
-      ss << "Unknown setting: [" << name << "] requested";
-      LIBP_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested"); // name matched neither the platform, mesh, nor ins settings
     }
   }
 }
 
-ellipticSettings_t* insSettings_t::extractVelocitySettings() {
+ellipticSettings_t insSettings_t::extractVelocitySettings() {
 
-  ellipticSettings_t* velocitySettings = new ellipticSettings_t(comm);
+  ellipticSettings_t velocitySettings(comm);
 
-  initialGuessAddSettings(*velocitySettings);
+  InitialGuess::AddSettings(velocitySettings);
 
-  for(auto it = velocitySettings->settings.begin(); it != velocitySettings->settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
+  for(auto it = velocitySettings.settings.begin(); it != velocitySettings.settings.end(); ++it) { // visit every setting registered in velocitySettings
+    setting_t& set = it->second; // reference, so updateVal below modifies the stored setting in place
+    const std::string name = set.getName(); // looked up on this object with the "VELOCITY " prefix
 
-    string val;
+    std::string val;
     getSetting("VELOCITY "+name, val);
 
-    set->updateVal(val);
+    set.updateVal(val);
   }
 
   return velocitySettings;
 }
 
-ellipticSettings_t* insSettings_t::extractPressureSettings() {
+ellipticSettings_t insSettings_t::extractPressureSettings() {
 
-  ellipticSettings_t* pressureSettings = new ellipticSettings_t(comm);
+  ellipticSettings_t pressureSettings(comm);
 
-  initialGuessAddSettings(*pressureSettings);
+  InitialGuess::AddSettings(pressureSettings);
 
-  for(auto it = pressureSettings->settings.begin(); it != pressureSettings->settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
+  for(auto it = pressureSettings.settings.begin(); it != pressureSettings.settings.end(); ++it) { // visit every setting registered in pressureSettings
+    setting_t& set = it->second; // reference, so updateVal below modifies the stored setting in place
+    const std::string name = set.getName(); // looked up on this object with the "PRESSURE " prefix
 
-    string val;
+    std::string val;
     getSetting("PRESSURE "+name, val);
 
-    set->updateVal(val);
+    set.updateVal(val);
   }
 
   return pressureSettings;
diff --git a/solvers/ins/src/insSetup.cpp b/solvers/ins/src/insSetup.cpp
index 5190e8ac1..c453105d3 100644
--- a/solvers/ins/src/insSetup.cpp
+++ b/solvers/ins/src/insSetup.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,23 +26,29 @@ SOFTWARE.
 
 #include "ins.hpp"
 
-ins_t& ins_t::Setup(platform_t& platform, mesh_t& mesh,
-                    insSettings_t& settings){
+void ins_t::Setup(platform_t& _platform, mesh_t& _mesh,
+                  insSettings_t& _settings){
 
-  ins_t* ins = new ins_t(platform, mesh, settings);
+  platform = _platform;
+  mesh = _mesh;
+  comm = _mesh.comm;
+  settings = _settings;
 
-  ins->NVfields = (mesh.dim==3) ? 3:2; // Total Number of Velocity Fields
-  ins->NTfields = (mesh.dim==3) ? 4:3; // Total Velocity + Pressure
+  //Trigger JIT kernel builds
+  ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add);
 
-  settings.getSetting("VISCOSITY", ins->nu);
+  NVfields = (mesh.dim==3) ? 3:2; // Total Number of Velocity Fields
+  NTfields = (mesh.dim==3) ? 4:3; // Total Velocity + Pressure
 
-  ins->cubature = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 1:0;
-  ins->pressureIncrement = (settings.compareSetting("PRESSURE INCREMENT", "TRUE")) ? 1:0;
+  settings.getSetting("VISCOSITY", nu);
+
+  cubature = (settings.compareSetting("ADVECTION TYPE", "CUBATURE")) ? 1:0;
+  pressureIncrement = (settings.compareSetting("PRESSURE INCREMENT", "TRUE")) ? 1:0;
 
   //setup cubature
-  if (ins->cubature) {
+  if (cubature) {
     mesh.CubatureSetup();
-    mesh.CubatureNodes();
+    mesh.CubaturePhysicalNodes();
   }
 
   dlong Nlocal = mesh.Nelements*mesh.Np;
@@ -51,27 +57,22 @@ ins_t& ins_t::Setup(platform_t& platform, mesh_t& mesh,
   //setup timeStepper
   dfloat gamma = 0.0;
   if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3")){
-    ins->timeStepper = new TimeStepper::extbdf3(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, ins->NVfields, *ins);
-    gamma = ((TimeStepper::extbdf3*) ins->timeStepper)->getGamma();
+    timeStepper.Setup<TimeStepper::extbdf3>(mesh.Nelements,
+                                            mesh.totalHaloPairs,
+                                            mesh.Np, NVfields, platform, comm);
+    gamma = timeStepper.GetGamma();
   } else if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")){
-    ins->timeStepper = new TimeStepper::ssbdf3(mesh.Nelements, mesh.totalHaloPairs,
-                                              mesh.Np, ins->NVfields, *ins);
-    gamma = ((TimeStepper::ssbdf3*) ins->timeStepper)->getGamma();
+    timeStepper.Setup<TimeStepper::ssbdf3>(mesh.Nelements,
+                                           mesh.totalHaloPairs,
+                                           mesh.Np, NVfields, platform, comm);
+    gamma = timeStepper.GetGamma();
   }
 
-  ins->Nsubcycles=1;
+  Nsubcycles=1;
   if (settings.compareSetting("TIME INTEGRATOR","SSBDF3"))
-    settings.getSetting("NUMBER OF SUBCYCLES", ins->Nsubcycles);
+    settings.getSetting("NUMBER OF SUBCYCLES", Nsubcycles);
 
   //Setup velocity Elliptic solvers
-  ins->uSolver=NULL;
-  ins->vSolver=NULL;
-  ins->wSolver=NULL;
-  ins->uLinearSolver=NULL;
-  ins->vLinearSolver=NULL;
-  ins->wLinearSolver=NULL;
-
   dlong uNlocal=0, vNlocal=0, wNlocal=0;
   dlong uNhalo=0, vNhalo=0, wNhalo=0;
   if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3")
@@ -85,273 +86,404 @@ ins_t& ins_t::Setup(platform_t& platform, mesh_t& mesh,
     // bc = 5 -> y-aligned slip
     // bc = 6 -> z-aligned slip
     int NBCTypes = 7;
-    int uBCType[NBCTypes] = {0,1,1,2,1,2,2}; // bc=3 => outflow => Neumann   => vBCType[3] = 2, etc.
-    int vBCType[NBCTypes] = {0,1,1,2,2,1,2}; // bc=3 => outflow => Neumann   => vBCType[3] = 2, etc.
-    int wBCType[NBCTypes] = {0,1,1,2,2,2,1}; // bc=3 => outflow => Neumann   => vBCType[3] = 2, etc.
-
-    ins->vSettings = settings.extractVelocitySettings();
+    memory<int> uBCType(NBCTypes);
+    // bc=3 => outflow => Neumann   => vBCType[3] = 2, etc.
+    uBCType[0] = 0;
+    uBCType[1] = 1;
+    uBCType[2] = 1;
+    uBCType[3] = 2;
+    uBCType[4] = 1;
+    uBCType[5] = 2;
+    uBCType[6] = 2;
+
+    memory<int> vBCType(NBCTypes);
+    // bc=3 => outflow => Neumann   => vBCType[3] = 2, etc.
+    vBCType[0] = 0;
+    vBCType[1] = 1;
+    vBCType[2] = 1;
+    vBCType[3] = 2;
+    vBCType[4] = 2;
+    vBCType[5] = 1;
+    vBCType[6] = 2;
+
+    memory<int> wBCType(NBCTypes);
+    // bc=3 => outflow => Neumann   => vBCType[3] = 2, etc.
+    wBCType[0] = 0;
+    wBCType[1] = 1;
+    wBCType[2] = 1;
+    wBCType[3] = 2;
+    wBCType[4] = 2;
+    wBCType[5] = 2;
+    wBCType[6] = 1;
+
+    vSettings = _settings.extractVelocitySettings();
 
     //make a guess at dt for the lambda value
     //TODO: we should allow preconditioners to be re-setup if lambda is updated
     dfloat hmin = mesh.MinCharacteristicLength();
-    dfloat dtAdvc = ins->Nsubcycles*hmin/((mesh.N+1.)*(mesh.N+1.));
-    dfloat lambda = gamma/(dtAdvc*ins->nu);
-    ins->uSolver = &(elliptic_t::Setup(platform, mesh, *(ins->vSettings),
-                                             lambda, NBCTypes, uBCType));
-    ins->vSolver = &(elliptic_t::Setup(platform, mesh, *(ins->vSettings),
-                                             lambda, NBCTypes, vBCType));
-    ins->wSolver = &(elliptic_t::Setup(platform, mesh, *(ins->vSettings),
-                                             lambda, NBCTypes, wBCType));
-    ins->vTau = ins->uSolver->tau;
-
-    ins->vDisc_c0 = settings.compareSetting("VELOCITY DISCRETIZATION", "CONTINUOUS") ? 1 : 0;
-
-    uNlocal = ins->uSolver->Ndofs;
-    vNlocal = ins->vSolver->Ndofs;
-    if (mesh.dim == 3) wNlocal = ins->wSolver->Ndofs;
-
-    uNhalo = ins->uSolver->Nhalo;
-    vNhalo = ins->vSolver->Nhalo;
-    if (mesh.dim == 3) wNhalo = ins->wSolver->Nhalo;
-
-    ins->uLinearSolver = initialGuessSolver_t::Setup(uNlocal, uNhalo,
-                                                    platform, *(ins->vSettings), mesh.comm);
-
-    ins->vLinearSolver = initialGuessSolver_t::Setup(vNlocal, vNhalo,
-                                                    platform, *(ins->vSettings), mesh.comm);
-    if (mesh.dim == 3) {
-      ins->wLinearSolver = initialGuessSolver_t::Setup(wNlocal, wNhalo,
-                                                       platform, *(ins->vSettings), mesh.comm);
+    dfloat dtAdvc = Nsubcycles*hmin/((mesh.N+1.)*(mesh.N+1.));
+    dfloat lambda = gamma/(dtAdvc*nu);
+    uSolver.Setup(platform, mesh, vSettings,
+                  lambda, NBCTypes, uBCType);
+    vSolver.Setup(platform, mesh, vSettings,
+                  lambda, NBCTypes, vBCType);
+    if (mesh.dim == 3)
+      wSolver.Setup(platform, mesh, vSettings,
+                    lambda, NBCTypes, wBCType);
+
+    vTau = uSolver.tau;
+
+    vDisc_c0 = settings.compareSetting("VELOCITY DISCRETIZATION", "CONTINUOUS") ? 1 : 0;
+
+    uNlocal = uSolver.Ndofs;
+    vNlocal = vSolver.Ndofs;
+    if (mesh.dim == 3) wNlocal = wSolver.Ndofs;
+
+    uNhalo = uSolver.Nhalo;
+    vNhalo = vSolver.Nhalo;
+    if (mesh.dim == 3) wNhalo = wSolver.Nhalo;
+
+    if (vSettings.compareSetting("LINEAR SOLVER","NBPCG")){
+
+      uLinearSolver.Setup<LinearSolver::nbpcg>(uNlocal, uNhalo, platform, vSettings, comm);
+      vLinearSolver.Setup<LinearSolver::nbpcg>(vNlocal, vNhalo, platform, vSettings, comm);
+      if (mesh.dim==3)
+        wLinearSolver.Setup<LinearSolver::nbpcg>(wNlocal, wNhalo, platform, vSettings, comm);
+
+    } else if (vSettings.compareSetting("LINEAR SOLVER","NBFPCG")){
+
+      uLinearSolver.Setup<LinearSolver::nbfpcg>(uNlocal, uNhalo, platform, vSettings, comm);
+      vLinearSolver.Setup<LinearSolver::nbfpcg>(vNlocal, vNhalo, platform, vSettings, comm);
+      if (mesh.dim==3)
+        wLinearSolver.Setup<LinearSolver::nbfpcg>(wNlocal, wNhalo, platform, vSettings, comm);
+
+    } else if (vSettings.compareSetting("LINEAR SOLVER","PCG")){
+
+      uLinearSolver.Setup<LinearSolver::pcg>(uNlocal, uNhalo, platform, vSettings, comm);
+      vLinearSolver.Setup<LinearSolver::pcg>(vNlocal, vNhalo, platform, vSettings, comm);
+      if (mesh.dim==3)
+        wLinearSolver.Setup<LinearSolver::pcg>(wNlocal, wNhalo, platform, vSettings, comm);
+
+    } else if (vSettings.compareSetting("LINEAR SOLVER","PGMRES")){
+
+      uLinearSolver.Setup<LinearSolver::pgmres>(uNlocal, uNhalo, platform, vSettings, comm);
+      vLinearSolver.Setup<LinearSolver::pgmres>(vNlocal, vNhalo, platform, vSettings, comm);
+      if (mesh.dim==3)
+        wLinearSolver.Setup<LinearSolver::pgmres>(wNlocal, wNhalo, platform, vSettings, comm);
+
+    } else if (vSettings.compareSetting("LINEAR SOLVER","PMINRES")){
+
+      uLinearSolver.Setup<LinearSolver::pminres>(uNlocal, uNhalo, platform, vSettings, comm);
+      vLinearSolver.Setup<LinearSolver::pminres>(vNlocal, vNhalo, platform, vSettings, comm);
+      if (mesh.dim==3)
+        wLinearSolver.Setup<LinearSolver::pminres>(wNlocal, wNhalo, platform, vSettings, comm);
+    }
+
+    if (vSettings.compareSetting("INITIAL GUESS STRATEGY", "NONE")) {
+
+      uLinearSolver.SetupInitialGuess<InitialGuess::Default>(uNlocal, platform, vSettings, comm);
+      vLinearSolver.SetupInitialGuess<InitialGuess::Default>(vNlocal, platform, vSettings, comm);
+      if (mesh.dim==3)
+        wLinearSolver.SetupInitialGuess<InitialGuess::Default>(wNlocal, platform, vSettings, comm);
+
+    } else if (vSettings.compareSetting("INITIAL GUESS STRATEGY", "ZERO")) {
+
+      uLinearSolver.SetupInitialGuess<InitialGuess::Zero>(uNlocal, platform, vSettings, comm);
+      vLinearSolver.SetupInitialGuess<InitialGuess::Zero>(vNlocal, platform, vSettings, comm);
+      if (mesh.dim==3)
+        wLinearSolver.SetupInitialGuess<InitialGuess::Zero>(wNlocal, platform, vSettings, comm);
+
+    } else if (vSettings.compareSetting("INITIAL GUESS STRATEGY", "CLASSIC")) {
+
+      uLinearSolver.SetupInitialGuess<InitialGuess::ClassicProjection>(uNlocal, platform, vSettings, comm);
+      vLinearSolver.SetupInitialGuess<InitialGuess::ClassicProjection>(vNlocal, platform, vSettings, comm);
+      if (mesh.dim==3)
+        wLinearSolver.SetupInitialGuess<InitialGuess::ClassicProjection>(wNlocal, platform, vSettings, comm);
+
+    } else if (vSettings.compareSetting("INITIAL GUESS STRATEGY", "QR")) {
+
+      uLinearSolver.SetupInitialGuess<InitialGuess::RollingQRProjection>(uNlocal, platform, vSettings, comm);
+      vLinearSolver.SetupInitialGuess<InitialGuess::RollingQRProjection>(vNlocal, platform, vSettings, comm);
+      if (mesh.dim==3)
+        wLinearSolver.SetupInitialGuess<InitialGuess::RollingQRProjection>(wNlocal, platform, vSettings, comm);
+
+    } else if (vSettings.compareSetting("INITIAL GUESS STRATEGY", "EXTRAP")) {
+
+      uLinearSolver.SetupInitialGuess<InitialGuess::Extrap>(uNlocal, platform, vSettings, comm);
+      vLinearSolver.SetupInitialGuess<InitialGuess::Extrap>(vNlocal, platform, vSettings, comm);
+      if (mesh.dim==3)
+        wLinearSolver.SetupInitialGuess<InitialGuess::Extrap>(wNlocal, platform, vSettings, comm);
+
     }
 
   } else {
-    ins->vDisc_c0 = 0;
+    vDisc_c0 = 0;
 
     //set penalty
-    if (mesh.elementType==TRIANGLES ||
-        mesh.elementType==QUADRILATERALS){
-      ins->vTau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0;
+    if (mesh.elementType==Mesh::TRIANGLES ||
+        mesh.elementType==Mesh::QUADRILATERALS){
+      vTau = 2.0*(mesh.N+1)*(mesh.N+2)/2.0;
       if(mesh.dim==3)
-        ins->vTau *= 1.5;
+        vTau *= 1.5;
     } else
-      ins->vTau = 2.0*(mesh.N+1)*(mesh.N+3);
+      vTau = 2.0*(mesh.N+1)*(mesh.N+3);
   }
 
   //Setup pressure Elliptic solver
   dlong pNlocal=0, pNhalo=0;
   {
     int NBCTypes = 7;
-    int pBCType[NBCTypes] = {0,2,2,1,2,2,2}; // bc=3 => outflow => Dirichlet => pBCType[3] = 1, etc.
-
-    ins->pSettings = settings.extractPressureSettings();
-    ins->pSolver = &(elliptic_t::Setup(platform, mesh, *(ins->pSettings),
-                                             0.0, NBCTypes, pBCType));
-    ins->pTau = ins->pSolver->tau;
-
-    ins->pDisc_c0 = settings.compareSetting("PRESSURE DISCRETIZATION", "CONTINUOUS") ? 1 : 0;
-
-    if (ins->pDisc_c0) {
-      pNlocal = ins->pSolver->ogsMasked->Ngather;
-      pNhalo  = ins->pSolver->ogsMasked->NgatherHalo;
+    memory<int> pBCType(NBCTypes);
+    // bc=3 => outflow => Dirichlet => pBCType[3] = 1, etc.
+    pBCType[0] = 0;
+    pBCType[1] = 2;
+    pBCType[2] = 2;
+    pBCType[3] = 1;
+    pBCType[4] = 2;
+    pBCType[5] = 2;
+    pBCType[6] = 2;
+
+    pSettings = _settings.extractPressureSettings();
+    pSolver.Setup(platform, mesh, pSettings,
+                  0.0, NBCTypes, pBCType);
+    pTau = pSolver.tau;
+
+    pDisc_c0 = settings.compareSetting("PRESSURE DISCRETIZATION", "CONTINUOUS") ? 1 : 0;
+
+    if (pDisc_c0) {
+      pNlocal = pSolver.ogsMasked.Ngather;
+      pNhalo  = pSolver.gHalo.Nhalo;
     } else {
       pNlocal = mesh.Nelements*mesh.Np;
       pNhalo  = mesh.totalHaloPairs*mesh.Np;
     }
 
-    ins->pLinearSolver = initialGuessSolver_t::Setup(pNlocal, pNhalo,
-                                                     platform, *(ins->pSettings), mesh.comm);
+    if (pSettings.compareSetting("LINEAR SOLVER","NBPCG")){ // select the pressure solver from pSettings, like every branch below
+      pLinearSolver.Setup<LinearSolver::nbpcg>(pNlocal, pNhalo, platform, pSettings, comm);
+    } else if (pSettings.compareSetting("LINEAR SOLVER","NBFPCG")){
+      pLinearSolver.Setup<LinearSolver::nbfpcg>(pNlocal, pNhalo, platform, pSettings, comm);
+    } else if (pSettings.compareSetting("LINEAR SOLVER","PCG")){
+      pLinearSolver.Setup<LinearSolver::pcg>(pNlocal, pNhalo, platform, pSettings, comm);
+    } else if (pSettings.compareSetting("LINEAR SOLVER","PGMRES")){
+      pLinearSolver.Setup<LinearSolver::pgmres>(pNlocal, pNhalo, platform, pSettings, comm);
+    } else if (pSettings.compareSetting("LINEAR SOLVER","PMINRES")){
+      pLinearSolver.Setup<LinearSolver::pminres>(pNlocal, pNhalo, platform, pSettings, comm);
+    }
+
+    if (pSettings.compareSetting("INITIAL GUESS STRATEGY", "NONE")) {
+      pLinearSolver.SetupInitialGuess<InitialGuess::Default>(pNlocal, platform, pSettings, comm);
+    } else if (pSettings.compareSetting("INITIAL GUESS STRATEGY", "ZERO")) {
+      pLinearSolver.SetupInitialGuess<InitialGuess::Zero>(pNlocal, platform, pSettings, comm);
+    } else if (pSettings.compareSetting("INITIAL GUESS STRATEGY", "CLASSIC")) {
+      pLinearSolver.SetupInitialGuess<InitialGuess::ClassicProjection>(pNlocal, platform, pSettings, comm);
+    } else if (pSettings.compareSetting("INITIAL GUESS STRATEGY", "QR")) {
+      pLinearSolver.SetupInitialGuess<InitialGuess::RollingQRProjection>(pNlocal, platform, pSettings, comm);
+    } else if (pSettings.compareSetting("INITIAL GUESS STRATEGY", "EXTRAP")) {
+      pLinearSolver.SetupInitialGuess<InitialGuess::Extrap>(pNlocal, platform, pSettings, comm);
+    }
   }
 
   //Solver tolerances
-  ins->presTOL = 1E-8;
-  ins->velTOL  = 1E-8;
-
-  //build node-wise boundary flag
-  ins->BoundarySetup();
+  if (sizeof(dfloat)==sizeof(double)) {
+    presTOL = 1.0E-8;
+    velTOL  = 1.0E-8;
+  } else {
+    presTOL = 1.0E-5;
+    velTOL  = 1.0E-5;
+  }
 
   //setup linear algebra module
-  platform.linAlg.InitKernels({"innerProd", "axpy", "max"});
+  platform.linAlg().InitKernels({"innerProd", "axpy", "max"});
 
   /*setup trace halo exchange */
-  ins->pTraceHalo = mesh.HaloTraceSetup(1); //one field
-  ins->vTraceHalo = mesh.HaloTraceSetup(ins->NVfields); //one field
+  pTraceHalo = mesh.HaloTraceSetup(1); //one field
+  vTraceHalo = mesh.HaloTraceSetup(NVfields); //NVfields fields (one per velocity component)
 
   // u and p at interpolation nodes
-  ins->u = (dfloat*) calloc((Nlocal+Nhalo)*ins->NVfields, sizeof(dfloat));
-  ins->o_u = platform.malloc((Nlocal+Nhalo)*ins->NVfields*sizeof(dfloat), ins->u);
+  u.malloc((Nlocal+Nhalo)*NVfields, 0.0);
+  o_u = platform.malloc<dfloat>(u);
 
-  ins->p = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat));
-  ins->o_p = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), ins->p);
+  p.malloc(Nlocal+Nhalo, 0.0);
+  o_p = platform.malloc<dfloat>(p);
 
   //storage for velocity gradient
   if ( !settings.compareSetting("TIME INTEGRATOR","EXTBDF3")
     && !settings.compareSetting("TIME INTEGRATOR","SSBDF3"))
-    ins->o_GU = platform.malloc((Nlocal+Nhalo)*4*sizeof(dfloat));
+    o_GU = platform.malloc<dfloat>((Nlocal+Nhalo)*4);
 
   //extra buffers for solvers
   if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3")
     ||settings.compareSetting("TIME INTEGRATOR","SSBDF3")) {
-    ins->o_UH = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat));
-    ins->o_VH = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat));
+    o_UH = platform.malloc<dfloat>(Nlocal+Nhalo, u);
+    o_VH = platform.malloc<dfloat>(Nlocal+Nhalo, u);
     if (mesh.dim==3)
-      ins->o_WH = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat));
-    else
-      ins->o_WH = platform.malloc((1)*sizeof(dfloat));
+      o_WH = platform.malloc<dfloat>(Nlocal+Nhalo, u);
 
-    ins->o_rhsU = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat));
-    ins->o_rhsV = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat));
+    o_rhsU = platform.malloc<dfloat>(Nlocal+Nhalo, u);
+    o_rhsV = platform.malloc<dfloat>(Nlocal+Nhalo, u);
     if (mesh.dim==3)
-      ins->o_rhsW = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat));
-    else
-      ins->o_rhsW = platform.malloc((1)*sizeof(dfloat));
+      o_rhsW = platform.malloc<dfloat>(Nlocal+Nhalo, u);
 
-    if (ins->vDisc_c0) {
-      ins->o_GUH = platform.malloc((uNlocal+uNhalo)*sizeof(dfloat), ins->u);
-      ins->o_GVH = platform.malloc((vNlocal+vNhalo)*sizeof(dfloat), ins->u);
+    if (vDisc_c0) {
+      o_GUH = platform.malloc<dfloat>(uNlocal+uNhalo, u);
+      o_GVH = platform.malloc<dfloat>(vNlocal+vNhalo, u);
       if (mesh.dim==3)
-        ins->o_GWH = platform.malloc((wNlocal+wNhalo)*sizeof(dfloat), ins->u);
+        o_GWH = platform.malloc<dfloat>(wNlocal+wNhalo, u);
 
-      ins->o_GrhsU = platform.malloc((uNlocal+uNhalo)*sizeof(dfloat));
-      ins->o_GrhsV = platform.malloc((vNlocal+vNhalo)*sizeof(dfloat));
+      o_GrhsU = platform.malloc<dfloat>(uNlocal+uNhalo, u);
+      o_GrhsV = platform.malloc<dfloat>(vNlocal+vNhalo, u);
       if (mesh.dim==3)
-        ins->o_GrhsW = platform.malloc((wNlocal+wNhalo)*sizeof(dfloat));
+        o_GrhsW = platform.malloc<dfloat>(wNlocal+wNhalo, u);
     }
   }
 
-  if (ins->pressureIncrement) {
-    ins->o_PI = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), ins->p);
-    ins->o_GPI = platform.malloc((pNlocal+pNhalo)*sizeof(dfloat), ins->p);
+  if (pressureIncrement) {
+    o_PI  = platform.malloc<dfloat>(p);
+    o_GPI = platform.malloc<dfloat>(p);
   }
 
-  ins->o_rhsP = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat));
-  if (ins->pDisc_c0) {
-    ins->o_GP    = platform.malloc((pNlocal+pNhalo)*sizeof(dfloat), ins->p);
-    ins->o_GrhsP = platform.malloc((pNlocal+pNhalo)*sizeof(dfloat));
+  o_rhsP = platform.malloc<dfloat>(p);
+  if (pDisc_c0) {
+    o_GP    = platform.malloc<dfloat>(p);
+    o_GrhsP = platform.malloc<dfloat>(p);
   }
 
   //storage for M*u during reporting
-  ins->o_MU = platform.malloc((Nlocal+Nhalo)*ins->NVfields*sizeof(dfloat), ins->u);
-  mesh.MassMatrixKernelSetup(ins->NVfields); // mass matrix operator
+  o_MU = platform.malloc<dfloat>(u);
+  mesh.MassMatrixKernelSetup(NVfields); // mass matrix operator
 
   if (mesh.dim==2) {
-    ins->Vort = (dfloat*) calloc((Nlocal+Nhalo), sizeof(dfloat));
-    ins->o_Vort = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), ins->Vort);
+    Vort.malloc(Nlocal+Nhalo, 0.0);
+    o_Vort = platform.malloc<dfloat>(Vort);
   } else {
-    ins->Vort = (dfloat*) calloc((Nlocal+Nhalo)*ins->NVfields, sizeof(dfloat));
-    ins->o_Vort = platform.malloc((Nlocal+Nhalo)*ins->NVfields*sizeof(dfloat), ins->Vort);
+    Vort.malloc((Nlocal+Nhalo)*NVfields, 0.0);
+    o_Vort = platform.malloc<dfloat>(Vort);
   }
 
   // OCCA build stuff
-  occa::properties kernelInfo = mesh.props; //copy base occa properties
+  properties_t kernelInfo = mesh.props; //copy base occa properties
 
   //add boundary data to kernel info
-  string dataFileName;
+  std::string dataFileName;
   settings.getSetting("DATA FILE", dataFileName);
   kernelInfo["includes"] += dataFileName;
 
-  kernelInfo["defines/" "p_Nfields"] = ins->NVfields;
-  kernelInfo["defines/" "p_NVfields"]= ins->NVfields;
-  kernelInfo["defines/" "p_NTfields"]= ins->NTfields;
+  kernelInfo["defines/" "p_Nfields"] = NVfields;
+  kernelInfo["defines/" "p_NVfields"]= NVfields;
+  kernelInfo["defines/" "p_NTfields"]= NTfields;
 
-  int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces));
+  int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces));
   kernelInfo["defines/" "p_maxNodes"]= maxNodes;
 
   int blockMax = 256;
 
-  int NblockV = mymax(1,blockMax/mesh.Np);
+  int NblockV = std::max(1,blockMax/mesh.Np);
   kernelInfo["defines/" "p_NblockV"]= NblockV;
 
-  int NblockS = mymax(1,blockMax/maxNodes);
+  int NblockS = std::max(1,blockMax/maxNodes);
   kernelInfo["defines/" "p_NblockS"]= NblockS;
 
-  if (ins->cubature) {
-    int cubMaxNodes = mymax(mesh.Np, (mesh.intNfp*mesh.Nfaces));
+  if (cubature) {
+    int cubMaxNodes = std::max(mesh.Np, (mesh.intNfp*mesh.Nfaces));
     kernelInfo["defines/" "p_cubMaxNodes"]= cubMaxNodes;
-    int cubMaxNodes1 = mymax(mesh.Np, (mesh.intNfp));
+    int cubMaxNodes1 = std::max(mesh.Np, (mesh.intNfp));
     kernelInfo["defines/" "p_cubMaxNodes1"]= cubMaxNodes1;
 
-    int cubNblockV = mymax(1,blockMax/mesh.cubNp);
+    int cubNblockV = std::max(1,blockMax/mesh.cubNp);
     kernelInfo["defines/" "p_cubNblockV"]= cubNblockV;
 
-    int cubNblockS = mymax(1,blockMax/cubMaxNodes);
+    int cubNblockS = std::max(1,blockMax/cubMaxNodes);
     kernelInfo["defines/" "p_cubNblockS"]= cubNblockS;
   }
 
-  kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
-
   // set kernel name suffix
-  char *suffix;
-  if(mesh.elementType==TRIANGLES)
-    suffix = strdup("Tri2D");
-  if(mesh.elementType==QUADRILATERALS)
-    suffix = strdup("Quad2D");
-  if(mesh.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  if(mesh.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  std::string suffix;
+  if(mesh.elementType==Mesh::TRIANGLES)
+    suffix = "Tri2D";
+  if(mesh.elementType==Mesh::QUADRILATERALS)
+    suffix = "Quad2D";
+  if(mesh.elementType==Mesh::TETRAHEDRA)
+    suffix = "Tet3D";
+  if(mesh.elementType==Mesh::HEXAHEDRA)
+    suffix = "Hex3D";
+
+  std::string oklFilePrefix = DINS "/okl/";
+  std::string oklFileSuffix = ".okl";
+
+  std::string fileName, kernelName;
 
   // advection kernels
-  ins->subcycler=NULL;
-  ins->subStepper=NULL;
   if (settings.compareSetting("TIME INTEGRATOR","SSBDF3")) {
     //subcycle kernels
-    if (ins->cubature) {
-      sprintf(fileName, DINS "/okl/insSubcycleCubatureAdvection%s.okl", suffix);
-      sprintf(kernelName, "insSubcycleAdvectionCubatureVolume%s", suffix);
-      ins->advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
+    if (cubature) {
+      fileName   = oklFilePrefix + "insSubcycleCubatureAdvection" + suffix + oklFileSuffix;
+      kernelName = "insSubcycleAdvectionCubatureVolume" + suffix;
+      advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
-      sprintf(kernelName, "insSubcycleAdvectionCubatureSurface%s", suffix);
-      ins->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
+      kernelName = "insSubcycleAdvectionCubatureSurface" + suffix;
+      advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
     } else {
-      sprintf(fileName, DINS "/okl/insSubcycleAdvection%s.okl", suffix);
-      sprintf(kernelName, "insSubcycleAdvectionVolume%s", suffix);
-      ins->advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
+      fileName   = oklFilePrefix + "insSubcycleAdvection" + suffix + oklFileSuffix;
+      kernelName = "insSubcycleAdvectionVolume" + suffix;
+      advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
-      sprintf(kernelName, "insSubcycleAdvectionSurface%s", suffix);
-      ins->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
+      kernelName = "insSubcycleAdvectionSurface" + suffix;
+      advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
     }
 
     //build subcycler
-    ins->subcycler  = new subcycler_t(*ins);
+    subcycler.platform = platform;
+    subcycler.mesh = mesh;
+    subcycler.comm = comm;
+    subcycler.settings = settings;
+
+    subcycler.NVfields = NVfields;
+    subcycler.nu = nu;
+    subcycler.cubature = cubature;
+    subcycler.vTraceHalo = vTraceHalo;
+    subcycler.advectionVolumeKernel = advectionVolumeKernel;
+    subcycler.advectionSurfaceKernel = advectionSurfaceKernel;
+
     if (settings.compareSetting("SUBCYCLING TIME INTEGRATOR","AB3")){
-      ins->subStepper = new TimeStepper::ab3(mesh.Nelements, mesh.totalHaloPairs,
-                                                mesh.Np, ins->NVfields, *(ins->subcycler));
+      subStepper.Setup<TimeStepper::ab3>(mesh.Nelements,
+                                         mesh.totalHaloPairs,
+                                         mesh.Np, NVfields, platform, comm);
     } else if (settings.compareSetting("SUBCYCLING TIME INTEGRATOR","LSERK4")){
-      ins->subStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs,
-                                                mesh.Np, ins->NVfields, *(ins->subcycler));
+      subStepper.Setup<TimeStepper::lserk4>(mesh.Nelements,
+                                            mesh.totalHaloPairs,
+                                            mesh.Np, NVfields, platform, comm);
     } else if (settings.compareSetting("SUBCYCLING TIME INTEGRATOR","DOPRI5")){
-      ins->subStepper = new TimeStepper::dopri5(mesh.Nelements, mesh.totalHaloPairs,
-                                                mesh.Np, ins->NVfields, *(ins->subcycler), mesh.comm);
+      subStepper.Setup<TimeStepper::dopri5>(mesh.Nelements,
+                                            mesh.totalHaloPairs,
+                                            mesh.Np, NVfields, platform, comm);
     }
 
-    sprintf(fileName, DINS "/okl/insSubcycleAdvection.okl");
-    sprintf(kernelName, "insSubcycleAdvectionKernel");
-    ins->subcycler->subCycleAdvectionKernel = platform.buildKernel(fileName, kernelName,
+    fileName   = oklFilePrefix + "insSubcycleAdvection" + oklFileSuffix;
+    kernelName = "insSubcycleAdvectionKernel";
+    subcycler.subCycleAdvectionKernel = platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
 
-    ins->subcycler->o_Ue = platform.malloc((Nlocal+Nhalo)*ins->NVfields*sizeof(dfloat), ins->u);
+    subcycler.o_Ue = platform.malloc<dfloat>(u);
 
   } else {
     //regular advection kernels
-    if (ins->cubature) {
-      sprintf(fileName, DINS "/okl/insCubatureAdvection%s.okl", suffix);
-      sprintf(kernelName, "insAdvectionCubatureVolume%s", suffix);
-      ins->advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
+    if (cubature) {
+      fileName   = oklFilePrefix + "insCubatureAdvection" + suffix + oklFileSuffix;
+      kernelName = "insAdvectionCubatureVolume" + suffix;
+      advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
-      sprintf(kernelName, "insAdvectionCubatureSurface%s", suffix);
-      ins->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
+      kernelName = "insAdvectionCubatureSurface" + suffix;
+      advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
     } else {
-      sprintf(fileName, DINS "/okl/insAdvection%s.okl", suffix);
-      sprintf(kernelName, "insAdvectionVolume%s", suffix);
-      ins->advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
+      fileName   = oklFilePrefix + "insAdvection" + suffix + oklFileSuffix;
+      kernelName = "insAdvectionVolume" + suffix;
+      advectionVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
-      sprintf(kernelName, "insAdvectionSurface%s", suffix);
-      ins->advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
+      kernelName = "insAdvectionSurface" + suffix;
+      advectionSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                              kernelInfo);
     }
   }
@@ -359,133 +491,95 @@ ins_t& ins_t::Setup(platform_t& platform, mesh_t& mesh,
   // diffusion kernels
   if (settings.compareSetting("TIME INTEGRATOR","EXTBDF3")
     ||settings.compareSetting("TIME INTEGRATOR","SSBDF3")) {
-    sprintf(fileName, DINS "/okl/insVelocityRhs%s.okl", suffix);
+    fileName   = oklFilePrefix + "insVelocityRhs" + suffix + oklFileSuffix;
 
-    if (ins->vDisc_c0)
-      sprintf(kernelName, "insVelocityRhs%s", suffix);
+    if (vDisc_c0)
+      kernelName = "insVelocityRhs" + suffix;
     else
-      sprintf(kernelName, "insVelocityIpdgRhs%s", suffix);
-    ins->velocityRhsKernel =  platform.buildKernel(fileName, kernelName,
+      kernelName = "insVelocityIpdgRhs" + suffix;
+    velocityRhsKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
 
-    sprintf(kernelName, "insVelocityBC%s", suffix);
-    ins->velocityBCKernel =  platform.buildKernel(fileName, kernelName,
+    kernelName = "insVelocityBC" + suffix;
+    velocityBCKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   } else {
     // gradient kernel
-    sprintf(fileName, DINS "/okl/insVelocityGradient%s.okl", suffix);
-    sprintf(kernelName, "insVelocityGradient%s", suffix);
-    ins->velocityGradientKernel =  platform.buildKernel(fileName, kernelName,
+    fileName   = oklFilePrefix + "insVelocityGradient" + suffix + oklFileSuffix;
+    kernelName = "insVelocityGradient" + suffix;
+    velocityGradientKernel =  platform.buildKernel(fileName, kernelName,
                                                kernelInfo);
 
-    sprintf(fileName, DINS "/okl/insDiffusion%s.okl", suffix);
-    sprintf(kernelName, "insDiffusion%s", suffix);
-    ins->diffusionKernel =  platform.buildKernel(fileName, kernelName,
+    fileName   = oklFilePrefix + "insDiffusion" + suffix + oklFileSuffix;
+    kernelName = "insDiffusion" + suffix;
+    diffusionKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   }
 
   //pressure gradient kernels
-  sprintf(fileName, DINS "/okl/insGradient%s.okl", suffix);
-  sprintf(kernelName, "insGradientVolume%s", suffix);
-  ins->gradientVolumeKernel =  platform.buildKernel(fileName, kernelName,
+  fileName   = oklFilePrefix + "insGradient" + suffix + oklFileSuffix;
+  kernelName = "insGradientVolume" + suffix;
+  gradientVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                          kernelInfo);
-  sprintf(kernelName, "insGradientSurface%s", suffix);
-  ins->gradientSurfaceKernel = platform.buildKernel(fileName, kernelName,
+  kernelName = "insGradientSurface" + suffix;
+  gradientSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                          kernelInfo);
 
   //velocity divergence kernels
-  sprintf(fileName, DINS "/okl/insDivergence%s.okl", suffix);
-  sprintf(kernelName, "insDivergenceVolume%s", suffix);
-  ins->divergenceVolumeKernel =  platform.buildKernel(fileName, kernelName,
+  fileName   = oklFilePrefix + "insDivergence" + suffix + oklFileSuffix;
+  kernelName = "insDivergenceVolume" + suffix;
+  divergenceVolumeKernel =  platform.buildKernel(fileName, kernelName,
                                          kernelInfo);
-  sprintf(kernelName, "insDivergenceSurface%s", suffix);
-  ins->divergenceSurfaceKernel = platform.buildKernel(fileName, kernelName,
+  kernelName = "insDivergenceSurface" + suffix;
+  divergenceSurfaceKernel = platform.buildKernel(fileName, kernelName,
                                          kernelInfo);
 
   //pressure solver kernels
-  if (ins->pressureIncrement) {
-    sprintf(fileName, DINS "/okl/insPressureIncrementRhs%s.okl", suffix);
+  if (pressureIncrement) {
+    fileName   = oklFilePrefix + "insPressureIncrementRhs" + suffix + oklFileSuffix;
 
-    if (ins->pDisc_c0)
-      sprintf(kernelName, "insPressureIncrementRhs%s", suffix);
+    if (pDisc_c0)
+      kernelName = "insPressureIncrementRhs" + suffix;
     else
-      sprintf(kernelName, "insPressureIncrementIpdgRhs%s", suffix);
-    ins->pressureIncrementRhsKernel =  platform.buildKernel(fileName, kernelName,
+      kernelName = "insPressureIncrementIpdgRhs" + suffix;
+    pressureIncrementRhsKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
 
-    sprintf(kernelName, "insPressureIncrementBC%s", suffix);
-    ins->pressureIncrementBCKernel =  platform.buildKernel(fileName, kernelName,
+    kernelName = "insPressureIncrementBC" + suffix;
+    pressureIncrementBCKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   } else {
-    sprintf(fileName, DINS "/okl/insPressureRhs%s.okl", suffix);
-    if (ins->pDisc_c0)
-      sprintf(kernelName, "insPressureRhs%s", suffix);
+    fileName   = oklFilePrefix + "insPressureRhs" + suffix + oklFileSuffix;
+    if (pDisc_c0)
+      kernelName = "insPressureRhs" + suffix;
     else
-      sprintf(kernelName, "insPressureIpdgRhs%s", suffix);
-    ins->pressureRhsKernel =  platform.buildKernel(fileName, kernelName,
+      kernelName = "insPressureIpdgRhs" + suffix;
+    pressureRhsKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
 
-    sprintf(kernelName, "insPressureBC%s", suffix);
-    ins->pressureBCKernel =  platform.buildKernel(fileName, kernelName,
+    kernelName = "insPressureBC" + suffix;
+    pressureBCKernel =  platform.buildKernel(fileName, kernelName,
                                            kernelInfo);
   }
 
-  sprintf(fileName, DINS "/okl/insVorticity%s.okl", suffix);
-  sprintf(kernelName, "insVorticity%s", suffix);
-  ins->vorticityKernel =  platform.buildKernel(fileName, kernelName,
+  fileName   = oklFilePrefix + "insVorticity" + suffix + oklFileSuffix;
+  kernelName = "insVorticity" + suffix;
+  vorticityKernel =  platform.buildKernel(fileName, kernelName,
                                             kernelInfo);
 
   if (mesh.dim==2) {
-    sprintf(fileName, DINS "/okl/insInitialCondition2D.okl");
-    sprintf(kernelName, "insInitialCondition2D");
+    fileName   = oklFilePrefix + "insInitialCondition2D" + oklFileSuffix;
+    kernelName = "insInitialCondition2D";
   } else {
-    sprintf(fileName, DINS "/okl/insInitialCondition3D.okl");
-    sprintf(kernelName, "insInitialCondition3D");
+    fileName   = oklFilePrefix + "insInitialCondition3D" + oklFileSuffix;
+    kernelName = "insInitialCondition3D";
   }
 
-  ins->initialConditionKernel = platform.buildKernel(fileName, kernelName,
+  initialConditionKernel = platform.buildKernel(fileName, kernelName,
                                                   kernelInfo);
 
-  sprintf(fileName, DINS "/okl/insMaxWaveSpeed%s.okl", suffix);
-  sprintf(kernelName, "insMaxWaveSpeed%s", suffix);
-
-  ins->maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
-
-  return *ins;
-}
-
-ins_t::~ins_t() {
-  advectionVolumeKernel.free();
-  advectionSurfaceKernel.free();
-  divergenceVolumeKernel.free();
-  divergenceSurfaceKernel.free();
-  gradientVolumeKernel.free();
-  gradientSurfaceKernel.free();
-  velocityGradientKernel.free();
-  diffusionKernel.free();
-  velocityRhsKernel.free();
-  velocityBCKernel.free();
-  pressureRhsKernel.free();
-  pressureBCKernel.free();
-  vorticityKernel.free();
-  initialConditionKernel.free();
-  maxWaveSpeedKernel.free();
-
-  if (pSolver) delete pSolver;
-  if (uSolver) delete uSolver;
-  if (vSolver) delete vSolver;
-  if (wSolver) delete wSolver;
-  if (timeStepper) delete timeStepper;
-  if (pLinearSolver) delete pLinearSolver;
-  if (uLinearSolver) delete uLinearSolver;
-  if (vLinearSolver) delete vLinearSolver;
-  if (wLinearSolver) delete wLinearSolver;
-  if (subStepper) delete subStepper;
-  if (subcycler) {
-    subcycler->subCycleAdvectionKernel.free();
-    delete subcycler;
-  }
+  fileName   = oklFilePrefix + "insMaxWaveSpeed" + suffix + oklFileSuffix;
+  kernelName = "insMaxWaveSpeed" + suffix;
 
-  if (vTraceHalo) vTraceHalo->Free();
-  if (pTraceHalo) pTraceHalo->Free();
+  maxWaveSpeedKernel = platform.buildKernel(fileName, kernelName, kernelInfo);
 }
diff --git a/solvers/ins/src/insStep.cpp b/solvers/ins/src/insStep.cpp
index 2f9f999e3..81790b843 100644
--- a/solvers/ins/src/insStep.cpp
+++ b/solvers/ins/src/insStep.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,10 +26,10 @@ SOFTWARE.
 
 #include "ins.hpp"
 
-dfloat ins_t::MaxWaveSpeed(occa::memory& o_U, const dfloat T){
+dfloat ins_t::MaxWaveSpeed(deviceMemory<dfloat>& o_U, const dfloat T){
 
   //Note: if this is on the critical path in the future, we should pre-allocate this
-  occa::memory o_maxSpeed = platform.malloc(mesh.Nelements*sizeof(dfloat));
+  deviceMemory<dfloat> o_maxSpeed = platform.malloc<dfloat>(mesh.Nelements);
 
   maxWaveSpeedKernel(mesh.Nelements,
                      mesh.o_vgeo,
@@ -44,18 +44,17 @@ dfloat ins_t::MaxWaveSpeed(occa::memory& o_U, const dfloat T){
                      o_U,
                      o_maxSpeed);
 
-  const dfloat vmax = platform.linAlg.max(mesh.Nelements, o_maxSpeed, mesh.comm);
+  const dfloat vmax = platform.linAlg().max(mesh.Nelements, o_maxSpeed, mesh.comm);
 
-  o_maxSpeed.free();
   return vmax;
 }
 
 // Inversion of diffusion operator
 //  Solves gamma*U - mu*Laplacian*U = rhs
 //  Afterwards, imposes incompressiblity via pressure problem
-void ins_t::rhs_imex_invg(occa::memory& o_RHS, occa::memory& o_U, const dfloat gamma, const dfloat T){
+void ins_t::rhs_imex_invg(deviceMemory<dfloat>& o_RHS, deviceMemory<dfloat>& o_U, const dfloat gamma, const dfloat T){
 
-  const dfloat dt = timeStepper->GetTimeStep();
+  const dfloat dt = timeStepper.GetTimeStep();
 
   if (pressureIncrement) {
     //use current pressure in velocity RHS
@@ -108,28 +107,27 @@ void ins_t::rhs_imex_invg(occa::memory& o_RHS, occa::memory& o_U, const dfloat g
 }
 
 // Evaluation of rhs f function
-void ins_t::rhs_imex_f(occa::memory& o_U, occa::memory& o_RHS, const dfloat T){
+void ins_t::rhs_imex_f(deviceMemory<dfloat>& o_U, deviceMemory<dfloat>& o_RHS, const dfloat T){
   // RHS = N(U)
   Advection(1.0, o_U, 0.0, o_RHS, T);
 }
 
 // Evolve rhs f function via a sub-timestepper
-void ins_t::rhs_subcycle_f(occa::memory& o_U, occa::memory& o_UHAT,
-                           const dfloat T, const dfloat dt, const dfloat* B,
+void ins_t::rhs_subcycle_f(deviceMemory<dfloat>& o_U, deviceMemory<dfloat>& o_UHAT,
+                           const dfloat T, const dfloat dt, const memory<dfloat> B,
                            const int order, const int shiftIndex, const int maxOrder) {
 
   //subcycle each Lagrangian state qhat by stepping dqhat/dt = F(qhat,t)
+  LIBP_ABORT("Subcycling supports only order 3 interpolation for now.",
+             order>=3);
 
-  if (order>=3)
-    LIBP_ABORT("Subcycling supports only order 3 interpolation for now.")
+  subcycler.order = order;
+  subcycler.maxOrder = maxOrder;
+  subcycler.shiftIndex = shiftIndex;
+  subcycler.T0 = T;
+  subcycler.dt = dt;
 
-  subcycler->order = order;
-  subcycler->maxOrder = maxOrder;
-  subcycler->shiftIndex = shiftIndex;
-  subcycler->T0 = T;
-  subcycler->dt = dt;
-
-  subcycler->o_Uh = o_U; //history
+  subcycler.o_Uh = o_U; //history of past states; the state at t-n*dt sits at offset ((shiftIndex+n)%maxOrder)*N
 
   //At each iteration of n, we step the partial sum
   // sum_i=n^order B[i]*U(t-i*dt) from t-n*dt to t-(n-1)*dt
@@ -142,13 +140,13 @@ void ins_t::rhs_subcycle_f(occa::memory& o_U, occa::memory& o_UHAT,
   for (int n=order;n>=0;n--) { //for each history state, starting with oldest
 
     //q at t-n*dt
-    occa::memory o_Un = o_U + ((shiftIndex+n)%maxOrder)*N*sizeof(dfloat);
+    deviceMemory<dfloat> o_Un = o_U + ((shiftIndex+n)%maxOrder)*N;
 
     //next scaled partial sum
-    linAlg.axpy(N, B[n+1]/(B[n+1]+bSum), o_Un,
-                   bSum/(B[n+1]+bSum), o_UHAT);
+    platform.linAlg().axpy(N, B[n+1]/(B[n+1]+bSum), o_Un,
+                              bSum/(B[n+1]+bSum), o_UHAT);
     bSum += B[n+1];
 
-    subStepper->Run(o_UHAT, T-n*dt, T-(n-1)*dt);
+    subStepper.Run(subcycler, o_UHAT, T-n*dt, T-(n-1)*dt);
   }
 }
diff --git a/solvers/ins/src/insSubcycle.cpp b/solvers/ins/src/insSubcycle.cpp
index 1fd62171b..b51a2e06c 100644
--- a/solvers/ins/src/insSubcycle.cpp
+++ b/solvers/ins/src/insSubcycle.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -26,19 +26,8 @@ SOFTWARE.
 
 #include "ins.hpp"
 
-subcycler_t::subcycler_t(ins_t& ins):
-  solver_t(ins.platform, ins.settings), mesh(ins.mesh) {
-
-  NVfields = ins.NVfields;
-  nu = ins.nu;
-  cubature = ins.cubature;
-  vTraceHalo = ins.vTraceHalo;
-  advectionVolumeKernel = ins.advectionVolumeKernel;
-  advectionSurfaceKernel = ins.advectionSurfaceKernel;
-}
-
 //evaluate ODE rhs = f(q,t)
-void subcycler_t::rhsf(occa::memory& o_U, occa::memory& o_RHS, const dfloat T){
+void subcycler_t::rhsf(deviceMemory<dfloat>& o_U, deviceMemory<dfloat>& o_RHS, const dfloat T){
 
   //interpolate velocity history for advective field (halo elements first)
   if(mesh.NhaloElements)
@@ -55,7 +44,7 @@ void subcycler_t::rhsf(occa::memory& o_U, occa::memory& o_RHS, const dfloat T){
                            o_Ue);
 
   // extract Ue halo
-  vTraceHalo->ExchangeStart(o_Ue, 1, ogs_dfloat);
+  vTraceHalo.ExchangeStart(o_Ue, 1);
 
   if(mesh.NinternalElements)
     subCycleAdvectionKernel(mesh.NinternalElements,
@@ -71,10 +60,10 @@ void subcycler_t::rhsf(occa::memory& o_U, occa::memory& o_RHS, const dfloat T){
                            o_Ue);
 
   // finish exchange of Ue
-  vTraceHalo->ExchangeFinish(o_Ue, 1, ogs_dfloat);
+  vTraceHalo.ExchangeFinish(o_Ue, 1);
 
   // extract u halo on DEVICE
-  vTraceHalo->ExchangeStart(o_U, 1, ogs_dfloat);
+  vTraceHalo.ExchangeStart(o_U, 1);
 
   if (cubature)
     advectionVolumeKernel(mesh.Nelements,
@@ -95,7 +84,7 @@ void subcycler_t::rhsf(occa::memory& o_U, occa::memory& o_RHS, const dfloat T){
                          o_U,
                          o_RHS);
 
-  vTraceHalo->ExchangeFinish(o_U, 1, ogs_dfloat);
+  vTraceHalo.ExchangeFinish(o_U, 1);
 
   if (cubature)
     advectionSurfaceKernel(mesh.Nelements,
diff --git a/solvers/ins/src/insVelocitySolve.cpp b/solvers/ins/src/insVelocitySolve.cpp
index 69b6124e8..59f2a5c4f 100644
--- a/solvers/ins/src/insVelocitySolve.cpp
+++ b/solvers/ins/src/insVelocitySolve.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,12 +27,13 @@ SOFTWARE.
 #include "ins.hpp"
 
 //  Solves gamma*U - nu*Laplacian*U = rhs
-void ins_t::VelocitySolve(occa::memory& o_U, occa::memory& o_RHS,
+void ins_t::VelocitySolve(deviceMemory<dfloat>& o_U, deviceMemory<dfloat>& o_RHS,
                           const dfloat gamma, const dfloat T) {
 
   // compute RHS = MM*RHS/nu + BCdata
   // and split fields to separate arrays
   velocityRhsKernel(mesh.Nelements,
+                      mesh.o_wJ,
                       mesh.o_vgeo,
                       mesh.o_sgeo,
                       mesh.o_ggeo,
@@ -43,7 +44,7 @@ void ins_t::VelocitySolve(occa::memory& o_U, occa::memory& o_RHS,
                       mesh.o_sM,
                       mesh.o_vmapM,
                       mesh.o_EToB,
-                      o_mapB,
+                      mesh.o_mapB,
                       vTau,
                       T,
                       mesh.o_x,
@@ -63,39 +64,39 @@ void ins_t::VelocitySolve(occa::memory& o_U, occa::memory& o_RHS,
   int maxIter = 5000;
   int verbose = 0;
 
-  uSolver->lambda = gamma/nu;
-  vSolver->lambda = gamma/nu;
-  wSolver->lambda = gamma/nu;
+  uSolver.lambda = gamma/nu;
+  vSolver.lambda = gamma/nu;
+  wSolver.lambda = gamma/nu;
 
   //  Solve lambda*U - Laplacian*U = rhs
   if (vDisc_c0){
     // gather, solve, scatter
-    uSolver->ogsMasked->Gather(o_GrhsU, o_rhsU, ogs_dfloat, ogs_add, ogs_trans);
-    NiterU = uSolver->Solve(*uLinearSolver, o_GUH, o_GrhsU, velTOL, maxIter, verbose);
-    uSolver->ogsMasked->Scatter(o_UH, o_GUH, ogs_dfloat, ogs_add, ogs_notrans);
+    uSolver.ogsMasked.Gather(o_GrhsU, o_rhsU, 1, ogs::Add, ogs::Trans);
+    NiterU = uSolver.Solve(uLinearSolver, o_GUH, o_GrhsU, velTOL, maxIter, verbose);
+    uSolver.ogsMasked.Scatter(o_UH, o_GUH, 1, ogs::NoTrans);
 
-    vSolver->ogsMasked->Gather(o_GrhsV, o_rhsV, ogs_dfloat, ogs_add, ogs_trans);
-    NiterV = vSolver->Solve(*vLinearSolver, o_GVH, o_GrhsV, velTOL, maxIter, verbose);
-    vSolver->ogsMasked->Scatter(o_VH, o_GVH, ogs_dfloat, ogs_add, ogs_notrans);
+    vSolver.ogsMasked.Gather(o_GrhsV, o_rhsV, 1, ogs::Add, ogs::Trans);
+    NiterV = vSolver.Solve(vLinearSolver, o_GVH, o_GrhsV, velTOL, maxIter, verbose);
+    vSolver.ogsMasked.Scatter(o_VH, o_GVH, 1, ogs::NoTrans);
 
     if (mesh.dim==3) {
-      wSolver->ogsMasked->Gather(o_GrhsW, o_rhsW, ogs_dfloat, ogs_add, ogs_trans);
-      NiterW = wSolver->Solve(*wLinearSolver, o_GWH, o_GrhsW, velTOL, maxIter, verbose);
-      wSolver->ogsMasked->Scatter(o_WH, o_GWH, ogs_dfloat, ogs_add, ogs_notrans);
+      wSolver.ogsMasked.Gather(o_GrhsW, o_rhsW, 1, ogs::Add, ogs::Trans);
+      NiterW = wSolver.Solve(wLinearSolver, o_GWH, o_GrhsW, velTOL, maxIter, verbose);
+      wSolver.ogsMasked.Scatter(o_WH, o_GWH, 1, ogs::NoTrans);
     }
 
   } else {
-    NiterU = uSolver->Solve(*uLinearSolver, o_UH, o_rhsU, velTOL, maxIter, verbose);
-    NiterV = vSolver->Solve(*vLinearSolver, o_VH, o_rhsV, velTOL, maxIter, verbose);
+    NiterU = uSolver.Solve(uLinearSolver, o_UH, o_rhsU, velTOL, maxIter, verbose);
+    NiterV = vSolver.Solve(vLinearSolver, o_VH, o_rhsV, velTOL, maxIter, verbose);
     if (mesh.dim==3)
-      NiterW = wSolver->Solve(*wLinearSolver, o_WH, o_rhsW, velTOL, maxIter, verbose);
+      NiterW = wSolver.Solve(wLinearSolver, o_WH, o_rhsW, velTOL, maxIter, verbose);
   }
 
   // merge arrays back, and enter BCs if C0
   velocityBCKernel(mesh.Nelements,
                   mesh.o_sgeo,
                   mesh.o_vmapM,
-                  o_mapB,
+                  mesh.o_mapB,
                   T,
                   mesh.o_x,
                   mesh.o_y,
diff --git a/solvers/lbs/data/lbsGaussian2D.h b/solvers/lbs/data/lbsGaussian2D.h
index 02ab2ee02..6cecc7760 100644
--- a/solvers/lbs/data/lbsGaussian2D.h
+++ b/solvers/lbs/data/lbsGaussian2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/lbs/data/lbsGaussian3D.h b/solvers/lbs/data/lbsGaussian3D.h
index 88e5ef82e..d10d70434 100644
--- a/solvers/lbs/data/lbsGaussian3D.h
+++ b/solvers/lbs/data/lbsGaussian3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/lbs/data/lbsUniform2D.h b/solvers/lbs/data/lbsUniform2D.h
index 36b31f7c4..fd2acaab8 100644
--- a/solvers/lbs/data/lbsUniform2D.h
+++ b/solvers/lbs/data/lbsUniform2D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/lbs/data/lbsUniform3D.h b/solvers/lbs/data/lbsUniform3D.h
index fb6c3722a..cef99167b 100644
--- a/solvers/lbs/data/lbsUniform3D.h
+++ b/solvers/lbs/data/lbsUniform3D.h
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/lbs/lbs.hpp b/solvers/lbs/lbs.hpp
index 1cc35411f..e4a29133a 100644
--- a/solvers/lbs/lbs.hpp
+++ b/solvers/lbs/lbs.hpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -36,28 +36,30 @@
 
 #define DLBS LIBP_DIR"/solvers/lbs/"
 
+using namespace libp;
+
 class lbsSettings_t: public settings_t {
 public:
-  lbsSettings_t(MPI_Comm& _comm);
+  lbsSettings_t(comm_t& _comm);
   void report();
   void parseFromFile(platformSettings_t& platformSettings,
                      meshSettings_t& meshSettings,
-                     const string filename);
+                     const std::string filename);
 };
 
 class lbs_t: public solver_t {
 public:
-  mesh_t& mesh;
+  mesh_t mesh;
 
   int Nfields;
   int Nmacro;
   int Npmlfields;
   int velModel;
 
-  TimeStepper::timeStepper_t* timeStepper;
+  timeStepper_t timeStepper;
 
-  halo_t* traceHalo;
-  halo_t** multirateTraceHalo;
+  ogs::halo_t traceHalo;
+  memory<ogs::halo_t> multirateTraceHalo;
 
   // dfloat RT, c, tauInv, Ma, Re, nu; // Flow parameters
   dfloat RT, c, tauInv, Re, nu, alpha; // Flow parameters
@@ -65,7 +67,7 @@ class lbs_t: public solver_t {
   // Pml
   int pmlOrder;
   dfloat  sigmaXmax, sigmaYmax, sigmaZmax;
-  dfloat *pmlSigma;
+  memory<dfloat> pmlSigma;
   dfloat pmlAlpha;
 
   // Flag for using cubature integration for sigma terms in pml
@@ -74,57 +76,56 @@ class lbs_t: public solver_t {
   // Flag for semi-analytic timestepping
   int semiAnalytic;
 
-  dfloat *q;
-  occa::memory o_q;
-  
+  memory<dfloat> q;
+  deviceMemory<dfloat> o_q;
+
   // external forcing in velocity space
-  dfloat *F; 
-  occa::memory o_F; 
-  
+  memory<dfloat> F;
+  deviceMemory<dfloat> o_F;
+
   // Macro quantities i.e. density + velocity
-  dfloat *U; 
-  occa::memory o_U; 
+  memory<dfloat> U;
+  deviceMemory<dfloat> o_U;
 
-  dfloat *LBM; 
-  occa::memory o_LBM;
+  memory<dfloat> LBM;
+  deviceMemory<dfloat> o_LBM;
 
-  int *LMAP; 
-  occa::memory o_LMAP; 
+  memory<int> LMAP;
+  deviceMemory<int> o_LMAP;
 
-  occa::memory o_Mq;
+  deviceMemory<dfloat> o_Mq;
 
-  dfloat *Vort, *VortMag;
-  occa::memory o_Vort, o_VortMag;
+  memory<dfloat> Vort, VortMag;
+  deviceMemory<dfloat> o_Vort, o_VortMag;
 
-  occa::memory o_pmlSigma;
+  deviceMemory<dfloat> o_pmlSigma;
 
-  occa::kernel collisionKernel; 
-  occa::kernel forcingKernel; 
-  occa::kernel momentsKernel; 
-  occa::kernel phaseFieldKernel; 
+  kernel_t collisionKernel;
+  kernel_t forcingKernel;
+  kernel_t momentsKernel;
+  kernel_t phaseFieldKernel;
 
-  occa::kernel volumeKernel;
-  occa::kernel surfaceKernel;
-  occa::kernel relaxationKernel;
+  kernel_t volumeKernel;
+  kernel_t surfaceKernel;
+  kernel_t relaxationKernel;
 
-  occa::kernel pmlVolumeKernel;
-  occa::kernel pmlSurfaceKernel;
-  occa::kernel pmlRelaxationKernel;
+  kernel_t pmlVolumeKernel;
+  kernel_t pmlSurfaceKernel;
+  kernel_t pmlRelaxationKernel;
 
-  occa::kernel vorticityKernel;
+  kernel_t vorticityKernel;
 
-  occa::kernel initialConditionKernel;
+  kernel_t initialConditionKernel;
 
-  lbs_t() = delete;
+  lbs_t() = default;
   lbs_t(platform_t &_platform, mesh_t &_mesh,
-	lbsSettings_t& _settings):
-    solver_t(_platform, _settings), mesh(_mesh) {}
-
-  ~lbs_t();
+        lbsSettings_t& _settings) {
+    Setup(_platform, _mesh, _settings);
+  }
 
   //setup
-  static lbs_t& Setup(platform_t& platform, mesh_t& mesh,
-                      lbsSettings_t& settings);
+  void Setup(platform_t& _platform, mesh_t& _mesh,
+             lbsSettings_t& _settings);
 
   void PmlSetup();
 
@@ -132,16 +133,15 @@ class lbs_t: public solver_t {
 
   void Report(dfloat time, int tstep);
 
-  void PlotFields(dfloat* Q, dfloat* V, char *fileName);
+  void PlotFields(memory<dfloat>& Q, memory<dfloat>& V, std::string fileName);
 
   dfloat MaxWaveSpeed();
 
+  void rhsf(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T);
 
-  void rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T);
-
-  void rhsVolume(dlong N, occa::memory& o_Q, occa::memory& o_RHS, const dfloat T);
+  void rhsVolume(dlong N, deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T);
 
-  void rhsSurface(dlong N, occa::memory& o_Q, occa::memory& o_RHS, const dfloat T);
+  void rhsSurface(dlong N, deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T);
 
   void latticeSetup(); 
 };
diff --git a/solvers/lbs/lbsMain.cpp b/solvers/lbs/lbsMain.cpp
index aa3861168..fc77abf7e 100644
--- a/solvers/lbs/lbsMain.cpp
+++ b/solvers/lbs/lbsMain.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,39 +29,40 @@ SOFTWARE.
 int main(int argc, char **argv){
 
   // start up MPI
-  MPI_Init(&argc, &argv);
+  Comm::Init(argc, argv);
 
-  MPI_Comm comm = MPI_COMM_WORLD;
+  LIBP_ABORT("Usage: ./lbsMain setupfile", argc!=2);
 
-  if(argc!=2)
-    LIBP_ABORT(string("Usage: ./lbsMain setupfile"));
+  { /* Scope so every object created below is destructed before Comm::Finalize() closes down MPI */
+    comm_t comm(Comm::World().Dup());
 
-  //create default settings
-  platformSettings_t platformSettings(comm);
-  meshSettings_t meshSettings(comm);
-  lbsSettings_t lbsSettings(comm);
+    //create default settings
+    platformSettings_t platformSettings(comm);
+    meshSettings_t meshSettings(comm);
+    lbsSettings_t lbsSettings(comm);
 
+    //load settings from file
+    lbsSettings.parseFromFile(platformSettings, meshSettings,
+                              argv[1]);
 
-  //load settings from file
-  lbsSettings.parseFromFile(platformSettings, meshSettings,
-                            argv[1]);
+    // set up platform
+    platform_t platform(platformSettings);
 
-  // set up platform
-  platform_t platform(platformSettings);
+    platformSettings.report();
+    meshSettings.report();
+    lbsSettings.report();
 
-  platformSettings.report();
-  meshSettings.report();
-  lbsSettings.report();
+    // set up mesh
+    mesh_t mesh(platform, meshSettings, comm);
 
-  // set up mesh
-  mesh_t& mesh = mesh_t::Setup(platform, meshSettings, comm);
+    // set up lbs solver
+    lbs_t lbs(platform, mesh, lbsSettings);
 
-  // set up lbs solver
-  lbs_t& lbs = lbs_t::Setup(platform, mesh, lbsSettings);
+    // run
+    lbs.Run();
+  }
 
-  // run
-  lbs.Run();
   // close down MPI
-  MPI_Finalize();
+  Comm::Finalize();
   return LIBP_SUCCESS;
 }
diff --git a/solvers/lbs/makefile b/solvers/lbs/makefile
index e5f175365..87f127636 100644
--- a/solvers/lbs/makefile
+++ b/solvers/lbs/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -77,26 +77,22 @@ include ../../make.top
 endif
 endif
 
-#gslib
-GS_DIR=${LIBP_TPL_DIR}/gslib
-
 #libraries
-LBS_LIBP_LIBS=timeStepper mesh ogs linAlg core
+LBS_LIBP_LIBS=timeStepper mesh parAdogs ogs linAlg core
 
 #includes
 INCLUDES=${LIBP_INCLUDES} \
-	-I.
+					-I.
 #defines
 DEFINES =${LIBP_DEFINES} \
-	-DLIBP_DIR='"${LIBP_DIR}"'
+					-DLIBP_DIR='"${LIBP_DIR}"'
 
 #.cpp compilation flags
-LBS_CXXFLAGS=${LIBP_MPICXXFLAGS} ${DEFINES} ${INCLUDES}
+LBS_CXXFLAGS=${LIBP_CXXFLAGS} ${DEFINES} ${INCLUDES}
 
 #link libraries
 LIBS=-L${LIBP_LIBS_DIR} $(addprefix -l,$(LBS_LIBP_LIBS)) \
-	-L$(GS_DIR)/lib -lgs \
-	${LIBP_LIBS}
+			${LIBP_LIBS}
 
 #link flags
 LFLAGS=${LBS_CXXFLAGS} ${LIBS}
@@ -111,7 +107,7 @@ SRC =$(wildcard src/*.cpp)
 OBJS=$(SRC:.cpp=.o)
 
 .PHONY: all lib libp_libs clean clean-libs \
-clean-kernels realclean help info
+				clean-kernels realclean help info
 
 all: lbsMain
 
@@ -143,10 +139,10 @@ endif
 # rule for .cpp files
 %.o: %.cpp $(DEPS) | libp_libs
 ifneq (,${verbose})
-	$(LIBP_MPICXX) -o $*.o -c $*.cpp $(LBS_CXXFLAGS)
+	$(LIBP_CXX) -o $*.o -c $*.cpp $(LBS_CXXFLAGS)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(LIBP_MPICXX) -o $*.o -c $*.cpp $(LBS_CXXFLAGS)
+	@$(LIBP_CXX) -o $*.o -c $*.cpp $(LBS_CXXFLAGS)
 endif
 
 #cleanup
@@ -157,8 +153,7 @@ clean-libs: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} clean
 
 clean-kernels: clean-libs
-# 	$(shell ${OCCA_DIR}/bin/occa clear all -y)
-	rm -rf ~/.occa/
+	rm -rf ${LIBP_DIR}/.occa/
 
 realclean: clean
 	${MAKE} -C ${LIBP_LIBS_DIR} realclean
diff --git a/solvers/lbs/okl/lbsCollisionHex3D.okl b/solvers/lbs/okl/lbsCollisionHex3D.okl
index cbb9b4aa7..ade97fdab 100644
--- a/solvers/lbs/okl/lbsCollisionHex3D.okl
+++ b/solvers/lbs/okl/lbsCollisionHex3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -36,178 +36,178 @@ void equiDist3D(const dfloat ew,  const dfloat ex, const dfloat ey, const dfloat
 
 // Compute collision step, physical velocity and scaled external forcing
 @kernel void lbsCollisionHex3D(const dlong Nelements,
-			       // @restrict const  dlong  *  elementIds,
-			       const dfloat t,
-			       const dfloat dt,
-			       const dfloat gamma, // lambda/dt
-			       const dfloat nu, // 1/Re
-			       @restrict const dfloat * LBM, 
-			       @restrict const dfloat * x, 
-			       @restrict const dfloat * y, 
-			       @restrict const dfloat * z, 
-			       @restrict const dfloat *  F,
-			       @restrict const dfloat *  U, 
-			       @restrict dfloat *  q){
+                               // @restrict const  dlong  *  elementIds,
+                               const dfloat t,
+                               const dfloat dt,
+                               const dfloat gamma, // lambda/dt
+                               const dfloat nu, // 1/Re
+                               @restrict const dfloat * LBM,
+                               @restrict const dfloat * x,
+                               @restrict const dfloat * y,
+                               @restrict const dfloat * z,
+                               @restrict const dfloat *  F,
+                               @restrict const dfloat *  U,
+                               @restrict dfloat *  q){
 
   for(dlong e=0;e<Nelements;++e; @outer(0)){  // for all elements
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
+        for(int i=0;i<p_Nq;++i;@inner(0)){
 
-	  const dlong idf  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nfields*p_Np*e;
-	  const dlong idn  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nmacro*p_Np*e;
+          const dlong idf  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nfields*p_Np*e;
+          const dlong idn  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nmacro*p_Np*e;
 
-	  const dfloat rn = U[idn +0*p_Np]; 
-          const dfloat un = U[idn +1*p_Np]; 
-          const dfloat vn = U[idn +2*p_Np]; 
-          const dfloat wn = U[idn +3*p_Np]; 
+          const dfloat rn = U[idn +0*p_Np];
+          const dfloat un = U[idn +1*p_Np];
+          const dfloat vn = U[idn +2*p_Np];
+          const dfloat wn = U[idn +3*p_Np];
 
 #pragma unroll p_Nfields
-	  for(int fld=0; fld<p_Nfields;++fld){
-	    const dfloat ew = LBM[fld + 0*p_Nfields]; 
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
-            const dfloat ez = LBM[fld + 3*p_Nfields]; 
+          for(int fld=0; fld<p_Nfields;++fld){
+            const dfloat ew = LBM[fld + 0*p_Nfields];
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
+            const dfloat ez = LBM[fld + 3*p_Nfields];
             //
-            const dfloat fn = F[idf+fld*p_Np]; // external forcing 
-            const dfloat qn = q[idf+fld*p_Np]; 
-            
-            dfloat qeq = 0.f; 
+            const dfloat fn = F[idf+fld*p_Np]; // external forcing
+            const dfloat qn = q[idf+fld*p_Np];
+
+            dfloat qeq = 0.f;
             equiDist3D(ew, ex, ey, ez, rn, un, vn, wn, &qeq);
-            
+
             // Compute forcing term using unmodified equilibrium distribution
-            const dfloat qext = fn*qeq*dt;  
-            
+            const dfloat qext = fn*qeq*dt;
+
             // modify equilibrium forcing here
-            qeq -= 0.5*dt*fn; 
-            
-            // collision 
-            q[idf+fld*p_Np] += (qext - 1.f/(gamma + 0.5f)*( qn - qeq)); 
-	  }
-	}
+            qeq -= 0.5*dt*fn;
+
+            // collision
+            q[idf+fld*p_Np] += (qext - 1.f/(gamma + 0.5f)*( qn - qeq));
+          }
+        }
       }
     }
   }
-}   
+}
 
 
 @kernel void lbsForcingHex3D(const dlong Nelements,
-			     // @restrict const  dlong  *  elementIds,
-			     const dfloat t,
-			     const dfloat dt,
-			     const dfloat gamma, // lambda/dt
-			     const dfloat nu, // 1/Re
-			     @restrict const dfloat * LBM, 
-			     @restrict const dfloat * x, 
-			     @restrict const dfloat * y, 
-			     @restrict const dfloat * z, 
-			     @restrict const dfloat * q, 
-			     @restrict dfloat *  F,
-			     @restrict dfloat *  U){
+                             // @restrict const  dlong  *  elementIds,
+                             const dfloat t,
+                             const dfloat dt,
+                             const dfloat gamma, // lambda/dt
+                             const dfloat nu, // 1/Re
+                             @restrict const dfloat * LBM,
+                             @restrict const dfloat * x,
+                             @restrict const dfloat * y,
+                             @restrict const dfloat * z,
+                             @restrict const dfloat * q,
+                             @restrict dfloat *  F,
+                             @restrict dfloat *  U){
+
 
-  
   for(dlong e=0;e<Nelements;++e; @outer(0)){  // for all elements
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
-	    
-	  const dlong idf  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nfields*p_Np*e;
-	  const dlong idn  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nmacro*p_Np*e;
-	    		
-	  const dfloat xn = x[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np]; 
-	  const dfloat yn = y[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np]; 
-	  const dfloat zn = z[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np]; 
-
-	  dfloat rn = U[idn +0*p_Np]; 
-          dfloat un = U[idn +1*p_Np]; 
-          dfloat vn = U[idn +2*p_Np]; 
-          dfloat wn = U[idn +3*p_Np]; 
-
-	  // Compute force here!!!!
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+
+          const dlong idf  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nfields*p_Np*e;
+          const dlong idn  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nmacro*p_Np*e;
+
+          const dfloat xn = x[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np];
+          const dfloat yn = y[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np];
+          const dfloat zn = z[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np];
+
+          dfloat rn = U[idn +0*p_Np];
+          dfloat un = U[idn +1*p_Np];
+          dfloat vn = U[idn +2*p_Np];
+          dfloat wn = U[idn +3*p_Np];
+
+          // Compute force here!!!!
           // !!! This is currenlty only force, can be modified
-          dfloat fx = 0.f, fy = 0.f, fz = 0.f; // Use previous rn un vn to compute fx, fy 
-          lbsBodyForce3D(nu, t, xn, yn, zn, rn, un, vn, wn, &fx, &fy, &fz); 
-          
+          dfloat fx = 0.f, fy = 0.f, fz = 0.f; // Use previous rn un vn to compute fx, fy
+          lbsBodyForce3D(nu, t, xn, yn, zn, rn, un, vn, wn, &fx, &fy, &fz);
+
 
-	  // Now update velocity 
-          rn =0.f, un =0.f, vn =0.f, wn=0.f; 
+          // Now update velocity
+          rn =0.f, un =0.f, vn =0.f, wn=0.f;
 #pragma unroll p_Nfields
           for(int fld=0; fld<p_Nfields;++fld){
-            // const dfloat ew = LBM[fld + 0*p_Nfields]; 
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
-            const dfloat ez = LBM[fld + 3*p_Nfields]; 
-            
-            const dfloat qn = q[idf+fld*p_Np]; 
+            // const dfloat ew = LBM[fld + 0*p_Nfields];
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
+            const dfloat ez = LBM[fld + 3*p_Nfields];
+
+            const dfloat qn = q[idf+fld*p_Np];
             rn  += qn; // density
-            un  += ex*qn; 
-            vn  += ey*qn; 
-            wn  += ez*qn; 
+            un  += ex*qn;
+            vn  += ey*qn;
+            wn  += ez*qn;
           }
 
           // This is currenlty only force, can be modified
           un  = (un + 0.5*fx*dt)/rn;
           vn  = (vn + 0.5*fy*dt)/rn;
           wn  = (wn + 0.5*fz*dt)/rn;
-          
 
-          for(int fld=0; fld<p_Nfields;++fld){         
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
-            const dfloat ez = LBM[fld + 3*p_Nfields]; 
-            F[idf +fld*p_Np] =  1.f/rn*p_ic2*( (ex-un)*fx + (ey-vn)*fy + (ez-wn)*fz ); 
+
+          for(int fld=0; fld<p_Nfields;++fld){
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
+            const dfloat ez = LBM[fld + 3*p_Nfields];
+            F[idf +fld*p_Np] =  1.f/rn*p_ic2*( (ex-un)*fx + (ey-vn)*fy + (ez-wn)*fz );
           }
 
-          U[idn + 0*p_Np] = rn; 
-          U[idn + 1*p_Np] = un; 
-          U[idn + 2*p_Np] = vn; 
-          U[idn + 3*p_Np] = wn; 
+          U[idn + 0*p_Np] = rn;
+          U[idn + 1*p_Np] = un;
+          U[idn + 2*p_Np] = vn;
+          U[idn + 3*p_Np] = wn;
 
-	}
+        }
       }
     }
-  }   
+  }
 }
 
 // Compute collision step and update velocity field
 @kernel void lbsMomentsHex3D(const dlong Nelements,
-			     // @restrict const  dlong  *  elementIds,
-			     @restrict const dfloat * LBM, 
-			     @restrict const dfloat *  q,
-			     @restrict dfloat *  U){
+                             // @restrict const  dlong  *  elementIds,
+                             @restrict const dfloat * LBM,
+                             @restrict const dfloat *  q,
+                             @restrict dfloat *  U){
 
   for(dlong e=0;e<Nelements;++e; @outer(0)){  // for all elements
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
-	  const dlong idf  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nfields*p_Np*e;
-	  const dlong idn  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nmacro*p_Np*e;
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+          const dlong idf  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nfields*p_Np*e;
+          const dlong idn  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nmacro*p_Np*e;
 
-	  dfloat rn = 0.f; 
-	  dfloat un = 0.f; 
-	  dfloat vn = 0.f; 
-	  dfloat wn = 0.f; 
+          dfloat rn = 0.f;
+          dfloat un = 0.f;
+          dfloat vn = 0.f;
+          dfloat wn = 0.f;
 
 #pragma unroll p_Nfields
-	  for(int fld=0; fld<p_Nfields;++fld){
-	    const dfloat qn = q[idf+fld*p_Np]; 
-	    rn  += qn; // density
-	    un  += LBM[fld + 1*p_Nfields]*qn; 
-	    vn  += LBM[fld + 2*p_Nfields]*qn; 
-	    wn  += LBM[fld + 3*p_Nfields]*qn; 
-	  }
-	  un = un/rn; //x-velocity
-	  vn = vn/rn; //y-velocity
-	  wn = wn/rn; //z-velocity
-
-	  // const dlong idn = e*p_Nmacro*p_Np + i + j*p_Nq + k*p_Nq*p_Nq ;
-
-	  U[idn + 0*p_Np] = rn; 
-	  U[idn + 1*p_Np] = un; 
-	  U[idn + 2*p_Np] = vn; 
-	  U[idn + 3*p_Np] = wn; 
-
-	}
+          for(int fld=0; fld<p_Nfields;++fld){
+            const dfloat qn = q[idf+fld*p_Np];
+            rn  += qn; // density
+            un  += LBM[fld + 1*p_Nfields]*qn;
+            vn  += LBM[fld + 2*p_Nfields]*qn;
+            wn  += LBM[fld + 3*p_Nfields]*qn;
+          }
+          un = un/rn; //x-velocity
+          vn = vn/rn; //y-velocity
+          wn = wn/rn; //z-velocity
+
+          // const dlong idn = e*p_Nmacro*p_Np + i + j*p_Nq + k*p_Nq*p_Nq ;
+
+          U[idn + 0*p_Np] = rn;
+          U[idn + 1*p_Np] = un;
+          U[idn + 2*p_Np] = vn;
+          U[idn + 3*p_Np] = wn;
+
+        }
       }
     }
   }
@@ -217,57 +217,57 @@ void equiDist3D(const dfloat ew,  const dfloat ex, const dfloat ey, const dfloat
 
 // Compute phase field from mocaro field
 @kernel void lbsPhaseFieldHex3D(const dlong Nelements,
-				// @restrict const  dlong  *  elementIds,
-				const dfloat t,
-				const dfloat dt,
-				const dfloat gamma, // lambda/dt
-				const dfloat nu, // 1/Re
-				@restrict const dfloat * LBM, 
-				@restrict const dfloat * x, 
-				@restrict const dfloat * y, 
-				@restrict const dfloat * z, 
-				@restrict const dfloat *  U,
-				@restrict dfloat *  q){
+                                // @restrict const  dlong  *  elementIds,
+                                const dfloat t,
+                                const dfloat dt,
+                                const dfloat gamma, // lambda/dt
+                                const dfloat nu, // 1/Re
+                                @restrict const dfloat * LBM,
+                                @restrict const dfloat * x,
+                                @restrict const dfloat * y,
+                                @restrict const dfloat * z,
+                                @restrict const dfloat *  U,
+                                @restrict dfloat *  q){
+
 
-  
   for(dlong e=0;e<Nelements;++e; @outer(0)){  // for all elements
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
-	    
-	  const dlong idf  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nfields*p_Np*e;
-	  const dlong idn  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nmacro*p_Np*e;
-	    		
-	  const dfloat xn = x[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np]; 
-	  const dfloat yn = y[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np]; 
-	  const dfloat zn = z[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np]; 
-
-	  const dfloat rn = U[idn +0*p_Np]; 
-          const dfloat un = U[idn +1*p_Np]; 
-          const dfloat vn = U[idn +2*p_Np]; 
-          const dfloat wn = U[idn +3*p_Np]; 
-
-	  // Compute force here!!!!
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+
+          const dlong idf  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nfields*p_Np*e;
+          const dlong idn  = i + j*p_Nq + k*p_Nq*p_Nq + p_Nmacro*p_Np*e;
+
+          const dfloat xn = x[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np];
+          const dfloat yn = y[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np];
+          const dfloat zn = z[i + j*p_Nq + k*p_Nq*p_Nq + e*p_Np];
+
+          const dfloat rn = U[idn +0*p_Np];
+          const dfloat un = U[idn +1*p_Np];
+          const dfloat vn = U[idn +2*p_Np];
+          const dfloat wn = U[idn +3*p_Np];
+
+          // Compute force here!!!!
           // !!! This is currenlty only force, can be modified
-          dfloat fx = 0.f, fy = 0.f, fz = 0.f; // Use previous rn un vn to compute fx, fy 
-          lbsBodyForce3D(nu, t, xn, yn, zn, rn, un, vn, wn, &fx, &fy, &fz); 
+          dfloat fx = 0.f, fy = 0.f, fz = 0.f; // Use previous rn un vn to compute fx, fy
+          lbsBodyForce3D(nu, t, xn, yn, zn, rn, un, vn, wn, &fx, &fy, &fz);
 
 #pragma unroll p_Nfields
           for(int fld=0; fld<p_Nfields;++fld){
-            const dfloat ew = LBM[fld + 0*p_Nfields]; 
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
-            const dfloat ez = LBM[fld + 3*p_Nfields]; 
+            const dfloat ew = LBM[fld + 0*p_Nfields];
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
+            const dfloat ez = LBM[fld + 3*p_Nfields];
 
-            dfloat qeq = 0.f; 
-            equiDist3D(ew, ex, ey, ez, rn, un, vn, wn, &qeq); 
-            const dfloat qext =  1.f/rn*p_ic2*((ex-un)*fx + (ey-vn)*fy + (ez-wn)*fz); 
+            dfloat qeq = 0.f;
+            equiDist3D(ew, ex, ey, ez, rn, un, vn, wn, &qeq);
+            const dfloat qext =  1.f/rn*p_ic2*((ex-un)*fx + (ey-vn)*fy + (ez-wn)*fz);
 
             q[idf + fld*p_Np]  = qeq*(1.f- 0.5f*dt*qext);
 
           }
 
-	}
+        }
       }
     }
   }
diff --git a/solvers/lbs/okl/lbsCollisionQuad2D.okl b/solvers/lbs/okl/lbsCollisionQuad2D.okl
index 5ddfca283..7f84da4c1 100644
--- a/solvers/lbs/okl/lbsCollisionQuad2D.okl
+++ b/solvers/lbs/okl/lbsCollisionQuad2D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -26,144 +26,144 @@
 
 // Equilibrium Distribution unmodified!!!!
 void equiDist2D(const dfloat w,  const dfloat ex, const dfloat ey,
-		const dfloat r,  const dfloat u,  const dfloat v, dfloat *feq){
+                const dfloat r,  const dfloat u,  const dfloat v, dfloat *feq){
 
-  const dfloat un = (ex*u + ey*v); 
-  *feq = r*w*(1.0f + p_ic2*un + 0.5f*p_ic4*un*un - 0.5f*p_ic2*(u*u + v*v)); 
+  const dfloat un = (ex*u + ey*v);
+  *feq = r*w*(1.0f + p_ic2*un + 0.5f*p_ic4*un*un - 0.5f*p_ic2*(u*u + v*v));
 }
 
 
 // Compute collision step, physical velocity and scaled external forcing
 @kernel void lbsCollisionQuad2D(const dlong Nelements,
-				// @restrict const  dlong  *  elementIds,
-				const dfloat t,
-				const dfloat dt,
-				const dfloat gamma, // lambda/dt
-				const dfloat nu, // 1/Re
-				@restrict const dfloat * LBM, 
-				@restrict const dfloat * x, 
-				@restrict const dfloat * y, 
-				@restrict const dfloat * z, 
-				@restrict const dfloat *  F,
-				@restrict const dfloat *  U, 
-				@restrict dfloat *  q){
+                                // @restrict const  dlong  *  elementIds,
+                                const dfloat t,
+                                const dfloat dt,
+                                const dfloat gamma, // lambda/dt
+                                const dfloat nu, // 1/Re
+                                @restrict const dfloat * LBM,
+                                @restrict const dfloat * x,
+                                @restrict const dfloat * y,
+                                @restrict const dfloat * z,
+                                @restrict const dfloat *  F,
+                                @restrict const dfloat *  U,
+                                @restrict dfloat *  q){
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
-	  const dlong et = eo+es; // element in block
-	  if(et<Nelements){
-	    // const dlong e    = elementIds[et];
-	    const dlong e    = et;
-	    const dlong idf  = i + j*p_Nq + p_Nfields*p_Np*e;
-	    const dlong idn  = i + j*p_Nq + p_Nmacro*p_Np*e;
-
-	    const dfloat rn = U[idn + 0*p_Np]; 
-	    const dfloat un = U[idn + 1*p_Np]; 
-	    const dfloat vn = U[idn + 2*p_Np]; 
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+          const dlong et = eo+es; // element in block
+          if(et<Nelements){
+            // const dlong e    = elementIds[et];
+            const dlong e    = et;
+            const dlong idf  = i + j*p_Nq + p_Nfields*p_Np*e;
+            const dlong idn  = i + j*p_Nq + p_Nmacro*p_Np*e;
+
+            const dfloat rn = U[idn + 0*p_Np];
+            const dfloat un = U[idn + 1*p_Np];
+            const dfloat vn = U[idn + 2*p_Np];
 
 #pragma unroll p_Nfields
-	    for(int fld=0; fld<p_Nfields;++fld){
-	      const dfloat ew = LBM[fld + 0*p_Nfields]; 
-	      const dfloat ex = LBM[fld + 1*p_Nfields]; 
-	      const dfloat ey = LBM[fld + 2*p_Nfields]; 
-	      //
-	      const dfloat fn = F[idf+fld*p_Np]; // external forcing 
-	      const dfloat qn = q[idf+fld*p_Np]; 
-
-	      dfloat qeq = 0.f; 
-	      equiDist2D(ew, ex, ey, rn, un, vn, &qeq);
-
-	      // Compute forcing term using unmodified equilibrium distribution
-	      const dfloat qext = fn*qeq*dt;  
-	      // modify equilibrium forcing here
-	      qeq -= 0.5*dt*fn;   
-	      // collision 
-	      q[idf+fld*p_Np] += (qext - 1.f/(gamma + 0.5f)*( qn - qeq)); 
-	    }
-	  }
-	}
+            for(int fld=0; fld<p_Nfields;++fld){
+              const dfloat ew = LBM[fld + 0*p_Nfields];
+              const dfloat ex = LBM[fld + 1*p_Nfields];
+              const dfloat ey = LBM[fld + 2*p_Nfields];
+              //
+              const dfloat fn = F[idf+fld*p_Np]; // external forcing
+              const dfloat qn = q[idf+fld*p_Np];
+
+              dfloat qeq = 0.f;
+              equiDist2D(ew, ex, ey, rn, un, vn, &qeq);
+
+              // Compute forcing term using unmodified equilibrium distribution
+              const dfloat qext = fn*qeq*dt;
+              // modify equilibrium forcing here
+              qeq -= 0.5*dt*fn;
+              // collision
+              q[idf+fld*p_Np] += (qext - 1.f/(gamma + 0.5f)*( qn - qeq));
+            }
+          }
+        }
       }
-    }   
+    }
   }
 }
 
 
 
-// Compute physical velocity and 
-// scaled external forcing i.e. 
+// Compute physical velocity and
+// scaled external forcing i.e.
 // (v-u)\dot G /(rho\timesc^2)
 
 @kernel void lbsForcingQuad2D(const dlong Nelements,
-			     // @restrict const  dlong  *  elementIds,
-			     const dfloat t,
-			     const dfloat dt,
-			     const dfloat gamma, // lambda/dt
-			     const dfloat nu, // 1/Re
-			     @restrict const dfloat * LBM, 
-			     @restrict const dfloat * x, 
-			     @restrict const dfloat * y, 
-			     @restrict const dfloat * z, 
-			     @restrict const dfloat * q, 
-			     @restrict dfloat *  F,
-			     @restrict dfloat *  U){
+                             // @restrict const  dlong  *  elementIds,
+                             const dfloat t,
+                             const dfloat dt,
+                             const dfloat gamma, // lambda/dt
+                             const dfloat nu, // 1/Re
+                             @restrict const dfloat * LBM,
+                             @restrict const dfloat * x,
+                             @restrict const dfloat * y,
+                             @restrict const dfloat * z,
+                             @restrict const dfloat * q,
+                             @restrict dfloat *  F,
+                             @restrict dfloat *  U){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){// for all elements in block
       for(int j=0;j<p_Nq;++j;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
-	  const dlong et = eo+es; // element in block
-	  if(et<Nelements){
-	    // const dlong e   = elementIds[et];
-	    const dlong e    = et;
-	    const dlong idf  = i + j*p_Nq + p_Nfields*p_Np*e;
-	    const dlong idn  = i + j*p_Nq + p_Nmacro*p_Np*e;
-
-	    const dfloat xn = x[i + j*p_Nq + e*p_Np]; 
-	    const dfloat yn = y[i + j*p_Nq + e*p_Np]; 
-
-	    // old velocities
-	    dfloat rn =  U[idn + 0*p_Np]; 
-	    dfloat un =  U[idn + 1*p_Np]; 
-	    dfloat vn =  U[idn + 2*p_Np]; 
-
-	    // !!! This is currenlty only force, can be modified
-	    dfloat fx = 0.f, fy = 0.f; // Use previous rn un vn to compute fx, fy 
-	    lbsBodyForce2D(nu, t, xn, yn, rn, un, vn, &fx, &fy); 
-
-	    // Now update velocity 
-	    rn =0.f, un =0.f, vn =0.f; 
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+          const dlong et = eo+es; // element in block
+          if(et<Nelements){
+            // const dlong e   = elementIds[et];
+            const dlong e    = et;
+            const dlong idf  = i + j*p_Nq + p_Nfields*p_Np*e;
+            const dlong idn  = i + j*p_Nq + p_Nmacro*p_Np*e;
+
+            const dfloat xn = x[i + j*p_Nq + e*p_Np];
+            const dfloat yn = y[i + j*p_Nq + e*p_Np];
+
+            // old velocities
+            dfloat rn =  U[idn + 0*p_Np];
+            dfloat un =  U[idn + 1*p_Np];
+            dfloat vn =  U[idn + 2*p_Np];
+
+            // !!! This is currently only force, can be modified
+            dfloat fx = 0.f, fy = 0.f; // Use previous rn un vn to compute fx, fy
+            lbsBodyForce2D(nu, t, xn, yn, rn, un, vn, &fx, &fy);
+
+            // Now update velocity
+            rn =0.f, un =0.f, vn =0.f;
 #pragma unroll p_Nfields
-	    for(int fld=0; fld<p_Nfields;++fld){
-	      const dfloat ex = LBM[fld + 1*p_Nfields]; 
-	      const dfloat ey = LBM[fld + 2*p_Nfields]; 
-	      const dfloat qn = q[idf+fld*p_Np]; 
-	      rn  += qn; // density
-	      un  += ex*qn; 
-	      vn  += ey*qn; 
-	    }
-
-	    // !!! This is currenlty only body force, can be modified
-	    un  = (un + 0.5*fx*dt)/rn;
-	    vn  = (vn + 0.5*fy*dt)/rn;
-
-
-	    for(int fld=0; fld<p_Nfields;++fld){         
-	      const dfloat ex = LBM[fld + 1*p_Nfields]; 
-	      const dfloat ey = LBM[fld + 2*p_Nfields]; 
-	      F[idf +fld*p_Np] =  1.f/rn*p_ic2*( (ex-un)*fx + (ey-vn)*fy ); 
-	    }
-
-	    U[idn + 0*p_Np] = rn; 
-	    U[idn + 1*p_Np] = un; 
-	    U[idn + 2*p_Np] = vn; 
-
-	  }
-	}
+            for(int fld=0; fld<p_Nfields;++fld){
+              const dfloat ex = LBM[fld + 1*p_Nfields];
+              const dfloat ey = LBM[fld + 2*p_Nfields];
+              const dfloat qn = q[idf+fld*p_Np];
+              rn  += qn; // density
+              un  += ex*qn;
+              vn  += ey*qn;
+            }
+
+            // !!! This is currently only body force, can be modified
+            un  = (un + 0.5*fx*dt)/rn;
+            vn  = (vn + 0.5*fy*dt)/rn;
+
+
+            for(int fld=0; fld<p_Nfields;++fld){
+              const dfloat ex = LBM[fld + 1*p_Nfields];
+              const dfloat ey = LBM[fld + 2*p_Nfields];
+              F[idf +fld*p_Np] =  1.f/rn*p_ic2*( (ex-un)*fx + (ey-vn)*fy );
+            }
+
+            U[idn + 0*p_Np] = rn;
+            U[idn + 1*p_Np] = un;
+            U[idn + 2*p_Np] = vn;
+
+          }
+        }
       }
-    }   
+    }
   }
 }
 
@@ -171,43 +171,43 @@ void equiDist2D(const dfloat w,  const dfloat ex, const dfloat ey,
 
 // Compute collision step and update velocity field
 @kernel void lbsMomentsQuad2D(const dlong Nelements,
-			     // @restrict const  dlong  *  elementIds,
-			     @restrict const dfloat * LBM, 
-			     @restrict const dfloat *  q,
-			     @restrict dfloat *  U){
+                             // @restrict const  dlong  *  elementIds,
+                             @restrict const dfloat * LBM,
+                             @restrict const dfloat *  q,
+                             @restrict dfloat *  U){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){// for all elements in block
       for(int j=0;j<p_Nq;++j;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
-	  const dlong et = eo+es; // element in block
-	  if(et<Nelements){
-	    // const dlong e = elementIds[et];
-	    const dlong e    = et;
-	    const dlong idf  = i + j*p_Nq + p_Nfields*p_Np*e;
-	    const dlong idn  = i + j*p_Nq + p_Nmacro*p_Np*e;
-
-	    dfloat rn = 0.f; 
-	    dfloat un = 0.f; 
-	    dfloat vn = 0.f; 
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+          const dlong et = eo+es; // element in block
+          if(et<Nelements){
+            // const dlong e = elementIds[et];
+            const dlong e    = et;
+            const dlong idf  = i + j*p_Nq + p_Nfields*p_Np*e;
+            const dlong idn  = i + j*p_Nq + p_Nmacro*p_Np*e;
+
+            dfloat rn = 0.f;
+            dfloat un = 0.f;
+            dfloat vn = 0.f;
 
 #pragma unroll p_Nfields
-	    for(int fld=0; fld<p_Nfields;++fld){
-	      const dfloat qn = q[idf+fld*p_Np]; 
-	      rn  += qn; // density
-	      un  += LBM[fld + 1*p_Nfields]*qn; 
-	      vn  += LBM[fld + 2*p_Nfields]*qn; 
-	    }
-	    un = un/rn; //x-velocity
-	    vn = vn/rn; //y-velocity
-
-	    U[idn + 0*p_Np] = rn; 
-	    U[idn + 1*p_Np] = un; 
-	    U[idn + 2*p_Np] = vn; 
-
-	  }
-	}
+            for(int fld=0; fld<p_Nfields;++fld){
+              const dfloat qn = q[idf+fld*p_Np];
+              rn  += qn; // density
+              un  += LBM[fld + 1*p_Nfields]*qn;
+              vn  += LBM[fld + 2*p_Nfields]*qn;
+            }
+            un = un/rn; //x-velocity
+            vn = vn/rn; //y-velocity
+
+            U[idn + 0*p_Np] = rn;
+            U[idn + 1*p_Np] = un;
+            U[idn + 2*p_Np] = vn;
+
+          }
+        }
       }
     }
   }
@@ -217,58 +217,58 @@ void equiDist2D(const dfloat w,  const dfloat ex, const dfloat ey,
 
 // Compute phase field from mocaro field
 @kernel void lbsPhaseFieldQuad2D(const dlong Nelements,
-				// @restrict const  dlong  *  elementIds,
-				const dfloat t,
-				const dfloat dt,
-				const dfloat gamma, // lambda/dt
-				const dfloat nu, // 1/Re
-				@restrict const dfloat * LBM, 
-				@restrict const dfloat * x, 
-				@restrict const dfloat * y, 
-				@restrict const dfloat * z, 
-				@restrict const dfloat *  U,
-				@restrict dfloat *  q){
+                                // @restrict const  dlong  *  elementIds,
+                                const dfloat t,
+                                const dfloat dt,
+                                const dfloat gamma, // lambda/dt
+                                const dfloat nu, // 1/Re
+                                @restrict const dfloat * LBM,
+                                @restrict const dfloat * x,
+                                @restrict const dfloat * y,
+                                @restrict const dfloat * z,
+                                @restrict const dfloat *  U,
+                                @restrict dfloat *  q){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
-    // @exclusive dlong e; 
+    // @exclusive dlong e;
     for(int es=0;es<p_NblockV;++es;@inner(2)){// for all elements in block
       for(int j=0;j<p_Nq;++j;@inner(1)){
-	for(int i=0;i<p_Nq;++i;@inner(0)){
-	  const dlong et = eo+es; // element in block
-	  if(et<Nelements){
-	    // const dlong e = elementIds[et];
-	    const dlong e    = et;
-	    const dlong idf  = i + j*p_Nq + p_Nfields*p_Np*e;
-	    const dlong idn  = i + j*p_Nq + p_Nmacro*p_Np*e;
+        for(int i=0;i<p_Nq;++i;@inner(0)){
+          const dlong et = eo+es; // element in block
+          if(et<Nelements){
+            // const dlong e = elementIds[et];
+            const dlong e    = et;
+            const dlong idf  = i + j*p_Nq + p_Nfields*p_Np*e;
+            const dlong idn  = i + j*p_Nq + p_Nmacro*p_Np*e;
 
 
-	    const dfloat rn = U[idn + 0*p_Np]; 
-	    const dfloat un = U[idn + 1*p_Np]; 
-	    const dfloat vn = U[idn + 2*p_Np]; 
+            const dfloat rn = U[idn + 0*p_Np];
+            const dfloat un = U[idn + 1*p_Np];
+            const dfloat vn = U[idn + 2*p_Np];
 
-	    const dfloat xn = x[i + j*p_Nq + e*p_Np]; 
-	    const dfloat yn = y[i + j*p_Nq + e*p_Np]; 
+            const dfloat xn = x[i + j*p_Nq + e*p_Np];
+            const dfloat yn = y[i + j*p_Nq + e*p_Np];
 
-	    dfloat fx = 0.f, fy = 0.f;
-	    lbsBodyForce2D(nu, t, xn, yn, rn, un, vn, &fx, &fy); 
+            dfloat fx = 0.f, fy = 0.f;
+            lbsBodyForce2D(nu, t, xn, yn, rn, un, vn, &fx, &fy);
 
 
 #pragma unroll p_Nfields
-	    for(int fld=0; fld<p_Nfields;++fld){
-	      const dfloat ew = LBM[fld + 0*p_Nfields]; 
-	      const dfloat ex = LBM[fld + 1*p_Nfields]; 
-	      const dfloat ey = LBM[fld + 2*p_Nfields]; 
+            for(int fld=0; fld<p_Nfields;++fld){
+              const dfloat ew = LBM[fld + 0*p_Nfields];
+              const dfloat ex = LBM[fld + 1*p_Nfields];
+              const dfloat ey = LBM[fld + 2*p_Nfields];
 
-	      dfloat qeq = 0.f; 
-	      equiDist2D(ew, ex, ey, rn, un, vn, &qeq); 
-	      const dfloat qext =  1.f/rn*p_ic2*((ex-un)*fx + (ey-vn)*fy); 
+              dfloat qeq = 0.f;
+              equiDist2D(ew, ex, ey, rn, un, vn, &qeq);
+              const dfloat qext =  1.f/rn*p_ic2*((ex-un)*fx + (ey-vn)*fy);
 
-	      q[idf + fld*p_Np]  = qeq*(1.f- 0.5f*dt*qext);
+              q[idf + fld*p_Np]  = qeq*(1.f- 0.5f*dt*qext);
 
-	    }
+            }
 
-	  }
-	}
+          }
+        }
       }
     }
   }
diff --git a/solvers/lbs/okl/lbsCollisionTet3D.okl b/solvers/lbs/okl/lbsCollisionTet3D.okl
index 18ca53855..92d72cb5d 100644
--- a/solvers/lbs/okl/lbsCollisionTet3D.okl
+++ b/solvers/lbs/okl/lbsCollisionTet3D.okl
@@ -35,21 +35,21 @@ void equiDist3D(const dfloat ew,  const dfloat ex, const dfloat ey, const dfloat
 
 // Compute collision step, physical velocity and scaled external forcing
 @kernel void lbsCollisionTet3D(const dlong Nelements,
-			       // @restrict const  dlong  *  elementIds,
-			       const dfloat t,
-			       const dfloat dt,
-			       const dfloat gamma, // lambda/dt
-			       const dfloat nu, // 1/Re
-			       @restrict const dfloat * LBM, 
-			       @restrict const dfloat * x, 
-			       @restrict const dfloat * y, 
-			       @restrict const dfloat * z, 
-			       @restrict const dfloat *  F,
-			       @restrict const dfloat *  U, 
-			       @restrict dfloat *  q){
+                               // @restrict const  dlong  *  elementIds,
+                               const dfloat t,
+                               const dfloat dt,
+                               const dfloat gamma, // lambda/dt
+                               const dfloat nu, // 1/Re
+                               @restrict const dfloat * LBM,
+                               @restrict const dfloat * x,
+                               @restrict const dfloat * y,
+                               @restrict const dfloat * z,
+                               @restrict const dfloat *  F,
+                               @restrict const dfloat *  U,
+                               @restrict dfloat *  q){
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
-    // @exclusive dlong e; 
-    // @exclusive dfloat r_u, r_v,r_r; 
+    // @exclusive dlong e;
+    // @exclusive dfloat r_u, r_v,r_r;
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
@@ -60,58 +60,58 @@ void equiDist3D(const dfloat ew,  const dfloat ex, const dfloat ey, const dfloat
           const dlong idf  = e*p_Nfields*p_Np + n;
           const dlong idn  = e*p_Nmacro*p_Np  + n;
 
-          const dfloat rn = U[idn +0*p_Np]; 
-          const dfloat un = U[idn +1*p_Np]; 
-          const dfloat vn = U[idn +2*p_Np]; 
-          const dfloat wn = U[idn +3*p_Np]; 
+          const dfloat rn = U[idn +0*p_Np];
+          const dfloat un = U[idn +1*p_Np];
+          const dfloat vn = U[idn +2*p_Np];
+          const dfloat wn = U[idn +3*p_Np];
 
 #pragma unroll p_Nfields
           for(int fld=0; fld<p_Nfields;++fld){
-            const dfloat ew = LBM[fld + 0*p_Nfields]; 
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
-            const dfloat ez = LBM[fld + 3*p_Nfields]; 
+            const dfloat ew = LBM[fld + 0*p_Nfields];
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
+            const dfloat ez = LBM[fld + 3*p_Nfields];
             //
-            const dfloat fn = F[idf+fld*p_Np]; // external forcing 
-            const dfloat qn = q[idf+fld*p_Np]; 
-            
-            dfloat qeq = 0.f; 
+            const dfloat fn = F[idf+fld*p_Np]; // external forcing
+            const dfloat qn = q[idf+fld*p_Np];
+
+            dfloat qeq = 0.f;
             equiDist3D(ew, ex, ey, ez, rn, un, vn, wn, &qeq);
-            
+
             // Compute forcing term using unmodified equilibrium distribution
-            const dfloat qext = fn*qeq*dt;  
-            
+            const dfloat qext = fn*qeq*dt;
+
             // modify equilibrium forcing here
-            qeq -= 0.5*dt*fn; 
-            
-            // collision 
-            q[idf+fld*p_Np] += (qext - 1.f/(gamma + 0.5f)*( qn - qeq)); 
+            qeq -= 0.5*dt*fn;
+
+            // collision
+            q[idf+fld*p_Np] += (qext - 1.f/(gamma + 0.5f)*( qn - qeq));
           }
         }
       }
     }
-  }   
+  }
 }
 
 
 
-// Compute physical velocity and 
-// scaled external forcing i.e. 
+// Compute physical velocity and
+// scaled external forcing i.e.
 // (v-u)\dot G /(rho\timesc^2)
 
 @kernel void lbsForcingTet3D(const dlong Nelements,
-			     // @restrict const  dlong  *  elementIds,
-			     const dfloat t,
-			     const dfloat dt,
-			     const dfloat gamma, // lambda/dt
-			     const dfloat nu, // 1/Re
-			     @restrict const dfloat * LBM, 
-			     @restrict const dfloat * x, 
-			     @restrict const dfloat * y, 
-			     @restrict const dfloat * z, 
-			     @restrict const dfloat * q, 
-			     @restrict dfloat *  F,
-			     @restrict dfloat *  U){
+                             // @restrict const  dlong  *  elementIds,
+                             const dfloat t,
+                             const dfloat dt,
+                             const dfloat gamma, // lambda/dt
+                             const dfloat nu, // 1/Re
+                             @restrict const dfloat * LBM,
+                             @restrict const dfloat * x,
+                             @restrict const dfloat * y,
+                             @restrict const dfloat * z,
+                             @restrict const dfloat * q,
+                             @restrict dfloat *  F,
+                             @restrict dfloat *  U){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
 
@@ -124,69 +124,69 @@ void equiDist3D(const dfloat ew,  const dfloat ex, const dfloat ey, const dfloat
           const dlong idf = e*p_Nfields*p_Np + n;
           const dlong idn = e*p_Nmacro*p_Np  + n;
 
-          const dfloat xn = x[e*p_Np + n]; 
-          const dfloat yn = y[e*p_Np + n]; 
-          const dfloat zn = z[e*p_Np + n]; 
+          const dfloat xn = x[e*p_Np + n];
+          const dfloat yn = y[e*p_Np + n];
+          const dfloat zn = z[e*p_Np + n];
 
           // old velocities
-          dfloat rn =  U[idn + 0*p_Np]; 
-          dfloat un =  U[idn + 1*p_Np]; 
-          dfloat vn =  U[idn + 2*p_Np]; 
-          dfloat wn =  U[idn + 3*p_Np]; 
+          dfloat rn =  U[idn + 0*p_Np];
+          dfloat un =  U[idn + 1*p_Np];
+          dfloat vn =  U[idn + 2*p_Np];
+          dfloat wn =  U[idn + 3*p_Np];
 
           // Compute force here!!!!
           // !!! This is currenlty only force, can be modified
-          dfloat fx = 0.f, fy = 0.f, fz = 0.f; // Use previous rn un vn to compute fx, fy 
-          lbsBodyForce3D(nu, t, xn, yn, zn, rn, un, vn, wn, &fx, &fy, &fz); 
-          
-          // Now update velocity 
-          rn =0.f, un =0.f, vn =0.f, wn=0.f; 
+          dfloat fx = 0.f, fy = 0.f, fz = 0.f; // Use previous rn, un, vn, wn to compute fx, fy, fz
+          lbsBodyForce3D(nu, t, xn, yn, zn, rn, un, vn, wn, &fx, &fy, &fz);
+
+          // Now re-accumulate density and momentum from q
+          rn =0.f, un =0.f, vn =0.f, wn=0.f;
 #pragma unroll p_Nfields
           for(int fld=0; fld<p_Nfields;++fld){
-            // const dfloat ew = LBM[fld + 0*p_Nfields]; 
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
-            const dfloat ez = LBM[fld + 3*p_Nfields]; 
-            
-            const dfloat qn = q[idf+fld*p_Np]; 
+            // const dfloat ew = LBM[fld + 0*p_Nfields];
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
+            const dfloat ez = LBM[fld + 3*p_Nfields];
+
+            const dfloat qn = q[idf+fld*p_Np];
             rn  += qn; // density
-            un  += ex*qn; 
-            vn  += ey*qn; 
-            wn  += ez*qn; 
+            un  += ex*qn;
+            vn  += ey*qn;
+            wn  += ez*qn;
           }
 
           // This is currenlty only force, can be modified
           un  = (un + 0.5*fx*dt)/rn;
           vn  = (vn + 0.5*fy*dt)/rn;
           wn  = (wn + 0.5*fz*dt)/rn;
-          
 
-          for(int fld=0; fld<p_Nfields;++fld){         
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
-            const dfloat ez = LBM[fld + 3*p_Nfields]; 
-            F[idf +fld*p_Np] =  1.f/rn*p_ic2*( (ex-un)*fx + (ey-vn)*fy + (ez-wn)*fz ); 
+
+          for(int fld=0; fld<p_Nfields;++fld){
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
+            const dfloat ez = LBM[fld + 3*p_Nfields];
+            F[idf +fld*p_Np] =  1.f/rn*p_ic2*( (ex-un)*fx + (ey-vn)*fy + (ez-wn)*fz );
           }
 
-          U[idn + 0*p_Np] = rn; 
-          U[idn + 1*p_Np] = un; 
-          U[idn + 2*p_Np] = vn; 
-          U[idn + 3*p_Np] = wn; 
+          U[idn + 0*p_Np] = rn;
+          U[idn + 1*p_Np] = un;
+          U[idn + 2*p_Np] = vn;
+          U[idn + 3*p_Np] = wn;
 
         }
       }
     }
-  }   
+  }
 }
 
 
 
 // Compute collision step and update velocity field
 @kernel void lbsMomentsTet3D(const dlong Nelements,
-			     // @restrict const  dlong  *  elementIds,
-			     @restrict const dfloat * LBM, 
-			     @restrict const dfloat *  q,
-			     @restrict dfloat *  U){
+                             // @restrict const  dlong  *  elementIds,
+                             @restrict const dfloat * LBM,
+                             @restrict const dfloat *  q,
+                             @restrict dfloat *  U){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
 
@@ -198,18 +198,18 @@ void equiDist3D(const dfloat ew,  const dfloat ex, const dfloat ey, const dfloat
           const dlong e = et;
           const dlong id = e*p_Nfields*p_Np + n;
 
-          dfloat rn = 0.f; 
-          dfloat un = 0.f; 
-          dfloat vn = 0.f; 
-          dfloat wn = 0.f; 
+          dfloat rn = 0.f;
+          dfloat un = 0.f;
+          dfloat vn = 0.f;
+          dfloat wn = 0.f;
 
 #pragma unroll p_Nfields
           for(int fld=0; fld<p_Nfields;++fld){
-            const dfloat qn = q[id+fld*p_Np]; 
+            const dfloat qn = q[id+fld*p_Np];
             rn  += qn; // density
-            un  += LBM[fld + 1*p_Nfields]*qn; 
-            vn  += LBM[fld + 2*p_Nfields]*qn; 
-            wn  += LBM[fld + 3*p_Nfields]*qn; 
+            un  += LBM[fld + 1*p_Nfields]*qn;
+            vn  += LBM[fld + 2*p_Nfields]*qn;
+            wn  += LBM[fld + 3*p_Nfields]*qn;
           }
           un = un/rn; //x-velocity
           vn = vn/rn; //y-velocity
@@ -217,10 +217,10 @@ void equiDist3D(const dfloat ew,  const dfloat ex, const dfloat ey, const dfloat
 
           const dlong idn = e*p_Nmacro*p_Np + n;
 
-          U[idn + 0*p_Np] = rn; 
-          U[idn + 1*p_Np] = un; 
-          U[idn + 2*p_Np] = vn; 
-          U[idn + 3*p_Np] = wn; 
+          U[idn + 0*p_Np] = rn;
+          U[idn + 1*p_Np] = un;
+          U[idn + 2*p_Np] = vn;
+          U[idn + 3*p_Np] = wn;
         }
       }
     }
@@ -231,40 +231,40 @@ void equiDist3D(const dfloat ew,  const dfloat ex, const dfloat ey, const dfloat
 
 // Compute phase field from mocaro field
 @kernel void lbsPhaseFieldTet3D(const dlong Nelements,
-				// @restrict const  dlong  *  elementIds,
-				const dfloat t,
-				const dfloat dt,
-				const dfloat gamma, // lambda/dt
-				const dfloat nu, // 1/Re
-				@restrict const dfloat * LBM, 
-				@restrict const dfloat * x, 
-				@restrict const dfloat * y, 
-				@restrict const dfloat * z, 
-				@restrict const dfloat *  U,
-				@restrict dfloat *  q){
+                                // @restrict const  dlong  *  elementIds,
+                                const dfloat t,
+                                const dfloat dt,
+                                const dfloat gamma, // lambda/dt
+                                const dfloat nu, // 1/Re
+                                @restrict const dfloat * LBM,
+                                @restrict const dfloat * x,
+                                @restrict const dfloat * y,
+                                @restrict const dfloat * z,
+                                @restrict const dfloat *  U,
+                                @restrict dfloat *  q){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
-    // @exclusive dlong e; 
+    // @exclusive dlong e;
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
         const dlong et = eo+es; // element in block
         if(et<Nelements){
-	  // const dlong e = elementIds[et];
-	  const dlong e = et;
+          // const dlong e = elementIds[et];
+          const dlong e = et;
 
-	  const dlong idf = e*p_Nfields*p_Np + n;
-	  const dlong idn = e*p_Nmacro*p_Np + n;
+          const dlong idf = e*p_Nfields*p_Np + n;
+          const dlong idn = e*p_Nmacro*p_Np + n;
 
-          const dfloat xn = x[n + e*p_Np]; 
-          const dfloat yn = y[n + e*p_Np]; 
-          const dfloat zn = z[n + e*p_Np]; 
+          const dfloat xn = x[n + e*p_Np];
+          const dfloat yn = y[n + e*p_Np];
+          const dfloat zn = z[n + e*p_Np];
 
-          const dfloat rn = U[idn + 0*p_Np]; 
-          const dfloat un = U[idn + 1*p_Np]; 
-          const dfloat vn = U[idn + 2*p_Np]; 
-          const dfloat wn = U[idn + 3*p_Np]; 
+          const dfloat rn = U[idn + 0*p_Np];
+          const dfloat un = U[idn + 1*p_Np];
+          const dfloat vn = U[idn + 2*p_Np];
+          const dfloat wn = U[idn + 3*p_Np];
 
-	  dfloat fx = 0.f, fy = 0.f, fz =0.f;
+          dfloat fx = 0.f, fy = 0.f, fz =0.f;
           lbsBodyForce3D(nu, t, xn, yn, zn, rn, un, vn, wn, &fx, &fy, &fz); 
 
 
diff --git a/solvers/lbs/okl/lbsCollisionTri2D.okl b/solvers/lbs/okl/lbsCollisionTri2D.okl
index 1541f9981..74c3e7f03 100644
--- a/solvers/lbs/okl/lbsCollisionTri2D.okl
+++ b/solvers/lbs/okl/lbsCollisionTri2D.okl
@@ -35,21 +35,21 @@ void equiDist2D(const dfloat w,  const dfloat ex, const dfloat ey,
 
 // Compute collision step, physical velocity and scaled external forcing
 @kernel void lbsCollisionTri2D(const dlong Nelements,
-			       // @restrict const  dlong  *  elementIds,
-			       const dfloat t,
-			       const dfloat dt,
-			       const dfloat gamma, // lambda/dt
-			       const dfloat nu, // 1/Re
-			       @restrict const dfloat * LBM, 
-			       @restrict const dfloat * x, 
-			       @restrict const dfloat * y, 
-			       @restrict const dfloat * z, 
-			       @restrict const dfloat *  F,
-			       @restrict const dfloat *  U, 
-			       @restrict dfloat *  q){
+                               // @restrict const  dlong  *  elementIds,
+                               const dfloat t,
+                               const dfloat dt,
+                               const dfloat gamma, // lambda/dt
+                               const dfloat nu, // 1/Re
+                               @restrict const dfloat * LBM,
+                               @restrict const dfloat * x,
+                               @restrict const dfloat * y,
+                               @restrict const dfloat * z,
+                               @restrict const dfloat *  F,
+                               @restrict const dfloat *  U,
+                               @restrict dfloat *  q){
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
-    // @exclusive dlong e; 
-    // @exclusive dfloat r_u, r_v,r_r; 
+    // @exclusive dlong e;
+    // @exclusive dfloat r_u, r_v,r_r;
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
@@ -60,56 +60,56 @@ void equiDist2D(const dfloat w,  const dfloat ex, const dfloat ey,
           const dlong idf  = e*p_Nfields*p_Np + n;
           const dlong idn  = e*p_Nmacro*p_Np  + n;
 
-          const dfloat rn = U[idn +0*p_Np]; 
-          const dfloat un = U[idn +1*p_Np]; 
-          const dfloat vn = U[idn +2*p_Np]; 
+          const dfloat rn = U[idn +0*p_Np];
+          const dfloat un = U[idn +1*p_Np];
+          const dfloat vn = U[idn +2*p_Np];
 
 #pragma unroll p_Nfields
           for(int fld=0; fld<p_Nfields;++fld){
-            const dfloat ew = LBM[fld + 0*p_Nfields]; 
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
+            const dfloat ew = LBM[fld + 0*p_Nfields];
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
             //
-            const dfloat fn = F[idf+fld*p_Np]; // external forcing 
-            const dfloat qn = q[idf+fld*p_Np]; 
-            
-            dfloat qeq = 0.f; 
+            const dfloat fn = F[idf+fld*p_Np]; // external forcing
+            const dfloat qn = q[idf+fld*p_Np];
+
+            dfloat qeq = 0.f;
             equiDist2D(ew, ex, ey, rn, un, vn, &qeq);
-            
+
             // Compute forcing term using unmodified equilibrium distribution
-            const dfloat qext = fn*qeq*dt;  
-            
+            const dfloat qext = fn*qeq*dt;
+
             // modify equilibrium forcing here
-            qeq -= 0.5*dt*fn; 
-            
-            // collision 
-            q[idf+fld*p_Np] += (qext - 1.f/(gamma + 0.5f)*( qn - qeq)); 
+            qeq -= 0.5*dt*fn;
+
+            // collision
+            q[idf+fld*p_Np] += (qext - 1.f/(gamma + 0.5f)*( qn - qeq));
           }
         }
       }
     }
-  }   
+  }
 }
 
 
 
-// Compute physical velocity and 
-// scaled external forcing i.e. 
+// Compute physical velocity and
+// scaled external forcing i.e.
 // (v-u)\dot G /(rho\timesc^2)
 
 @kernel void lbsForcingTri2D(const dlong Nelements,
-			     // @restrict const  dlong  *  elementIds,
-			     const dfloat t,
-			     const dfloat dt,
-			     const dfloat gamma, // lambda/dt
-			     const dfloat nu, // 1/Re
-			     @restrict const dfloat * LBM, 
-			     @restrict const dfloat * x, 
-			     @restrict const dfloat * y, 
-			     @restrict const dfloat * z, 
-			     @restrict const dfloat * q, 
-			     @restrict dfloat *  F,
-			     @restrict dfloat *  U){
+                             // @restrict const  dlong  *  elementIds,
+                             const dfloat t,
+                             const dfloat dt,
+                             const dfloat gamma, // lambda/dt
+                             const dfloat nu, // 1/Re
+                             @restrict const dfloat * LBM,
+                             @restrict const dfloat * x,
+                             @restrict const dfloat * y,
+                             @restrict const dfloat * z,
+                             @restrict const dfloat * q,
+                             @restrict dfloat *  F,
+                             @restrict dfloat *  U){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
 
@@ -122,62 +122,62 @@ void equiDist2D(const dfloat w,  const dfloat ex, const dfloat ey,
           const dlong idf = e*p_Nfields*p_Np + n;
           const dlong idn = e*p_Nmacro*p_Np  + n;
 
-          const dfloat xn = x[e*p_Np +n]; 
-          const dfloat yn = y[e*p_Np +n]; 
+          const dfloat xn = x[e*p_Np +n];
+          const dfloat yn = y[e*p_Np +n];
 
           // old velocities
-          dfloat rn =  U[idn + 0*p_Np]; 
-          dfloat un =  U[idn + 1*p_Np]; 
-          dfloat vn =  U[idn + 2*p_Np]; 
+          dfloat rn =  U[idn + 0*p_Np];
+          dfloat un =  U[idn + 1*p_Np];
+          dfloat vn =  U[idn + 2*p_Np];
 
           // Compute force here!!!!
           // !!! This is currenlty only force, can be modified
-          dfloat fx = 0.f, fy = 0.f; // Use previous rn un vn to compute fx, fy 
-          lbsBodyForce2D(nu, t, xn, yn, rn, un, vn, &fx, &fy); 
-          
-          // Now update velocity 
-          rn =0.f, un =0.f, vn =0.f; 
+          dfloat fx = 0.f, fy = 0.f; // Use previous rn un vn to compute fx, fy
+          lbsBodyForce2D(nu, t, xn, yn, rn, un, vn, &fx, &fy);
+
+          // Now re-accumulate density and momentum from q
+          rn =0.f, un =0.f, vn =0.f;
 #pragma unroll p_Nfields
           for(int fld=0; fld<p_Nfields;++fld){
-            // const dfloat ew = LBM[fld + 0*p_Nfields]; 
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
-            
-            const dfloat qn = q[idf+fld*p_Np]; 
+            // const dfloat ew = LBM[fld + 0*p_Nfields];
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
+
+            const dfloat qn = q[idf+fld*p_Np];
             rn  += qn; // density
-            un  += ex*qn; 
-            vn  += ey*qn; 
+            un  += ex*qn;
+            vn  += ey*qn;
           }
 
           // !!! This is currenlty only force, can be modified
           un  = (un + 0.5*fx*dt)/rn;
           vn  = (vn + 0.5*fy*dt)/rn;
-          
 
-          for(int fld=0; fld<p_Nfields;++fld){         
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
-            F[idf +fld*p_Np] =  1.f/rn*p_ic2*( (ex-un)*fx + (ey-vn)*fy ); 
+
+          for(int fld=0; fld<p_Nfields;++fld){
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
+            F[idf +fld*p_Np] =  1.f/rn*p_ic2*( (ex-un)*fx + (ey-vn)*fy );
           }
 
-          U[idn + 0*p_Np] = rn; 
-          U[idn + 1*p_Np] = un; 
-          U[idn + 2*p_Np] = vn; 
+          U[idn + 0*p_Np] = rn;
+          U[idn + 1*p_Np] = un;
+          U[idn + 2*p_Np] = vn;
 
         }
       }
     }
-  }   
+  }
 }
 
 
 
 // Compute collision step and update velocity field
 @kernel void lbsMomentsTri2D(const dlong Nelements,
-			     // @restrict const  dlong  *  elementIds,
-			     @restrict const dfloat * LBM, 
-			     @restrict const dfloat *  q,
-			     @restrict dfloat *  U){
+                             // @restrict const  dlong  *  elementIds,
+                             @restrict const dfloat * LBM,
+                             @restrict const dfloat *  q,
+                             @restrict dfloat *  U){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
 
@@ -189,25 +189,25 @@ void equiDist2D(const dfloat w,  const dfloat ex, const dfloat ey,
           const dlong e = et;
           const dlong id = e*p_Nfields*p_Np + n;
 
-          dfloat rn = 0.f; 
-          dfloat un = 0.f; 
-          dfloat vn = 0.f; 
+          dfloat rn = 0.f;
+          dfloat un = 0.f;
+          dfloat vn = 0.f;
 
 #pragma unroll p_Nfields
           for(int fld=0; fld<p_Nfields;++fld){
-            const dfloat qn = q[id+fld*p_Np]; 
+            const dfloat qn = q[id+fld*p_Np];
             rn  += qn; // density
-            un  += LBM[fld + 1*p_Nfields]*qn; 
-            vn  += LBM[fld + 2*p_Nfields]*qn; 
+            un  += LBM[fld + 1*p_Nfields]*qn;
+            vn  += LBM[fld + 2*p_Nfields]*qn;
           }
           un = un/rn; //x-velocity
           vn = vn/rn; //y-velocity
 
           const dlong idn = e*p_Nmacro*p_Np + n;
 
-          U[idn + 0*p_Np] = rn; 
-          U[idn + 1*p_Np] = un; 
-          U[idn + 2*p_Np] = vn; 
+          U[idn + 0*p_Np] = rn;
+          U[idn + 1*p_Np] = un;
+          U[idn + 2*p_Np] = vn;
 
         }
       }
@@ -219,35 +219,35 @@ void equiDist2D(const dfloat w,  const dfloat ex, const dfloat ey,
 
 // Compute phase field from mocaro field
 @kernel void lbsPhaseFieldTri2D(const dlong Nelements,
-				// @restrict const  dlong  *  elementIds,
-				const dfloat t,
-				const dfloat dt,
-				const dfloat gamma, // lambda/dt
-				const dfloat nu, // 1/Re
-				@restrict const dfloat * LBM, 
-				@restrict const dfloat * x, 
-				@restrict const dfloat * y, 
-				@restrict const dfloat * z, 
-				@restrict const dfloat *  U,
-				@restrict dfloat *  q){
+                                // @restrict const  dlong  *  elementIds,
+                                const dfloat t,
+                                const dfloat dt,
+                                const dfloat gamma, // lambda/dt
+                                const dfloat nu, // 1/Re
+                                @restrict const dfloat * LBM,
+                                @restrict const dfloat * x,
+                                @restrict const dfloat * y,
+                                @restrict const dfloat * z,
+                                @restrict const dfloat *  U,
+                                @restrict dfloat *  q){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
-    // @exclusive dlong e; 
+    // @exclusive dlong e;
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
         const dlong et = eo+es; // element in block
         if(et<Nelements){
-	  // const dlong e = elementIds[et];
-	  const dlong e = et;
+          // const dlong e = elementIds[et];
+          const dlong e = et;
 
-	  const dlong idf = e*p_Nfields*p_Np + n;
-	  const dlong idn = e*p_Nmacro*p_Np + n;
+          const dlong idf = e*p_Nfields*p_Np + n;
+          const dlong idn = e*p_Nmacro*p_Np + n;
 
-          const dfloat rn = U[idn + 0*p_Np]; 
-          const dfloat un = U[idn + 1*p_Np]; 
-          const dfloat vn = U[idn + 2*p_Np]; 
+          const dfloat rn = U[idn + 0*p_Np];
+          const dfloat un = U[idn + 1*p_Np];
+          const dfloat vn = U[idn + 2*p_Np];
 
-	  dfloat fx = 0.f, fy = 0.f;
+          dfloat fx = 0.f, fy = 0.f;
           lbsBodyForce2D(nu, t, xn, yn, rn, un, vn, &fx, &fy); 
 
 
diff --git a/solvers/lbs/okl/lbsInitialCondition2D.okl b/solvers/lbs/okl/lbsInitialCondition2D.okl
index 28781713f..cea6e8dff 100644
--- a/solvers/lbs/okl/lbsInitialCondition2D.okl
+++ b/solvers/lbs/okl/lbsInitialCondition2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/lbs/okl/lbsInitialCondition3D.okl b/solvers/lbs/okl/lbsInitialCondition3D.okl
index 2fd6174a3..5b9877974 100644
--- a/solvers/lbs/okl/lbsInitialCondition3D.okl
+++ b/solvers/lbs/okl/lbsInitialCondition3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/solvers/lbs/okl/lbsSurfaceHex3D.okl b/solvers/lbs/okl/lbsSurfaceHex3D.okl
index 680767ded..bc2dd0eaa 100644
--- a/solvers/lbs/okl/lbsSurfaceHex3D.okl
+++ b/solvers/lbs/okl/lbsSurfaceHex3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -134,24 +134,24 @@ void surfaceTerms(const int e,
 
 
 @kernel void lbsSurfaceHex3D(const dlong Nelements,
-			      // @restrict const  dlong  *  elementIds,
-			      @restrict const  dfloat *  sgeo,
-			      @restrict const  dfloat *  LIFT,
-			      @restrict const  dlong  *  vmapM,
-			      @restrict const  dlong  *  vmapP,
-			      @restrict const  int    *  EToB,
-			      @restrict const  dfloat *  x,
-			      @restrict const  dfloat *  y,
-			      @restrict const  dfloat *  z,
-			      const dfloat dt,
-			      const dfloat time,
-			      const dfloat nu,
-			      @restrict const  int *  LMAP,
-			      @restrict const  dfloat *  LBM,
-			      @restrict const  dfloat *  F,
-			      @restrict const  dfloat *  U,
-			      @restrict const  dfloat *  q,
-			      @restrict dfloat *  rhsq){
+                              // @restrict const  dlong  *  elementIds,
+                              @restrict const  dfloat *  sgeo,
+                              @restrict const  dfloat *  LIFT,
+                              @restrict const  dlong  *  vmapM,
+                              @restrict const  dlong  *  vmapP,
+                              @restrict const  int    *  EToB,
+                              @restrict const  dfloat *  x,
+                              @restrict const  dfloat *  y,
+                              @restrict const  dfloat *  z,
+                              const dfloat dt,
+                              const dfloat time,
+                              const dfloat nu,
+                              @restrict const  int *  LMAP,
+                              @restrict const  dfloat *  LBM,
+                              @restrict const  dfloat *  F,
+                              @restrict const  dfloat *  U,
+                              @restrict const  dfloat *  q,
+                              @restrict dfloat *  rhsq){
 
   // for all elements
   for(dlong et=0;et<Nelements;++et;@outer(0)){
@@ -170,7 +170,6 @@ void surfaceTerms(const int e,
           }
       }
     }
-    @barrier("local");
 
     // face 0 & 5
     for(int j=0;j<p_Nq;++j;@inner(1)){
@@ -188,7 +187,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
      // face 1 & 3
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -204,7 +202,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 2 & 4
     for(int k=0;k<p_Nq;++k;@inner(1)){
@@ -221,7 +218,6 @@ void surfaceTerms(const int e,
       }
 
 
-   @barrier("local");
 
     for(int j=0;j<p_Nq;++j;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
diff --git a/solvers/lbs/okl/lbsSurfaceQuad2D.okl b/solvers/lbs/okl/lbsSurfaceQuad2D.okl
index 0f4ec4bc3..a934a0097 100644
--- a/solvers/lbs/okl/lbsSurfaceQuad2D.okl
+++ b/solvers/lbs/okl/lbsSurfaceQuad2D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -95,58 +95,58 @@ void surfaceTerms(const int e,
     vm   = U[uidM+2*p_Np];  
 
     lbsBoundaryConditions2D(bc, nu, time, x[idM], y[idM], nx, ny, 
-			    rm, um, vm, 
-			    &rb, &ub, &vb); 
+                            rm, um, vm,
+                            &rb, &ub, &vb);
   }
 
-  dfloat qm[p_Nfields]; 
-  dfloat qp[p_Nfields]; 
+  dfloat qm[p_Nfields];
+  dfloat qp[p_Nfields];
 
 #pragma unroll p_Nfields
   for(int fld=0; fld<p_Nfields;++fld){
     qm[fld] = q[qidM + fld*p_Np];
     qp[fld] = q[qidP + fld*p_Np];
   }
-  
+
 #pragma unroll p_Nfields
   for(int fld=0; fld<p_Nfields;++fld){
-    const dfloat fn = F[qidM + fld*p_Np]; 
-    const dfloat ex = LBM[fld + 1*p_Nfields]; 
-    const dfloat ey = LBM[fld + 2*p_Nfields]; 
+    const dfloat fn = F[qidM + fld*p_Np];
+    const dfloat ex = LBM[fld + 1*p_Nfields];
+    const dfloat ey = LBM[fld + 2*p_Nfields];
     const dfloat en = ex*nx + ey*ny;
 
     if(bc>0){
       const dfloat ew   =  LBM[fld + 0*p_Nfields];
       const int   idr   =  LMAP[fld];
-      applyBC2D(bc, fld, idr, dt, en, nx, ny, ew, ex, ey,rb, ub, vb, fn, qm, qp); 
+      applyBC2D(bc, fld, idr, dt, en, nx, ny, ew, ex, ey,rb, ub, vb, fn, qm, qp);
     }
     s_fluxq[es][fld][j][i] += 0.5f*sc*(en -fabs(en))*(qm[fld] - qp[fld]);
   }
 }
- 
 
-             
+
+
 
 
 @kernel void lbsSurfaceQuad2D(const dlong Nelements,
-			      // @restrict const  dlong  *  elementIds,
-			      @restrict const  dfloat *  sgeo,
-			      @restrict const  dfloat *  LIFT,
-			      @restrict const  dlong  *  vmapM,
-			      @restrict const  dlong  *  vmapP,
-			      @restrict const  int    *  EToB,
-			      @restrict const  dfloat *  x,
-			      @restrict const  dfloat *  y,
-			      @restrict const  dfloat *  z,
-			      const dfloat dt,
-			      const dfloat time,
-			      const dfloat nu,
-			      @restrict const  int *  LMAP,
-			      @restrict const  dfloat *  LBM,
-			      @restrict const  dfloat *  F,
-			      @restrict const  dfloat *  U,
-			      @restrict const  dfloat *  q,
-			      @restrict dfloat *  rhsq){
+                              // @restrict const  dlong  *  elementIds,
+                              @restrict const  dfloat *  sgeo,
+                              @restrict const  dfloat *  LIFT,
+                              @restrict const  dlong  *  vmapM,
+                              @restrict const  dlong  *  vmapP,
+                              @restrict const  int    *  EToB,
+                              @restrict const  dfloat *  x,
+                              @restrict const  dfloat *  y,
+                              @restrict const  dfloat *  z,
+                              const dfloat dt,
+                              const dfloat time,
+                              const dfloat nu,
+                              @restrict const  int *  LMAP,
+                              @restrict const  dfloat *  LBM,
+                              @restrict const  dfloat *  F,
+                              @restrict const  dfloat *  U,
+                              @restrict const  dfloat *  q,
+                              @restrict dfloat *  rhsq){
 
   // for all elements
   for(dlong eo=0;eo<Nelements;eo+=p_NblockS;@outer(0)){
@@ -159,15 +159,14 @@ void surfaceTerms(const int e,
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
 #pragma unroll p_Nq
-	for(int j=0;j<p_Nq;++j){
+        for(int j=0;j<p_Nq;++j){
 #pragma unroll p_Nfields
-	  for(int fld=0; fld<p_Nfields; fld++)
-	    s_fluxq[es][fld][j][i] = 0.f;
-	}
+          for(int fld=0; fld<p_Nfields; fld++)
+            s_fluxq[es][fld][j][i] = 0.f;
+        }
       }
     }
 
-    @barrier("local");
 
     // face 0 & 2
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -175,7 +174,7 @@ void surfaceTerms(const int e,
         const dlong et = eo + es;
         if(et<Nelements){
           // const dlong   e = elementIds[et];
-          const dlong   e = et; 
+          const dlong   e = et;
           const dlong sk0 = e*p_Nfp*p_Nfaces + 0*p_Nfp + i;
           const dlong sk2 = e*p_Nfp*p_Nfaces + 2*p_Nfp + i;
 
@@ -188,7 +187,6 @@ void surfaceTerms(const int e,
       }
     }
 
-    @barrier("local");
 
     // face 1 & 3
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -196,35 +194,34 @@ void surfaceTerms(const int e,
         const dlong et = eo + es;
         if(et<Nelements){
           // const dlong   e = elementIds[et];
-          const dlong   e = et; 
+          const dlong   e = et;
           const dlong sk1 = e*p_Nfp*p_Nfaces + 1*p_Nfp + j;
           const dlong sk3 = e*p_Nfp*p_Nfaces + 3*p_Nfp + j;
 
           surfaceTerms(e, es, sk1, 1, p_Nq-1, j,
-		       sgeo, nu, time, dt, x, y, vmapM, vmapP, EToB, LMAP, LBM, F, U, q, s_fluxq);
+                       sgeo, nu, time, dt, x, y, vmapM, vmapP, EToB, LMAP, LBM, F, U, q, s_fluxq);
 
           surfaceTerms(e, es, sk3, 3, 0, j,
-		       sgeo, nu, time, dt, x, y, vmapM, vmapP, EToB, LMAP, LBM, F, U, q, s_fluxq);
+                       sgeo, nu, time, dt, x, y, vmapM, vmapP, EToB, LMAP, LBM, F, U, q, s_fluxq);
         }
       }
     }
 
 
-    @barrier("local");
 
     for(int es=0;es<p_NblockS;++es;@inner(1)){
       for(int i=0;i<p_Nq;++i;@inner(0)){
         const dlong et = eo + es;
         if(et<Nelements){
 #pragma unroll p_Nq
-	  for(int j=0;j<p_Nq;++j){
-	    // const dlong   e   = elementIds[et];
-	    const dlong   e = et; 
-	    const dlong rhsId = e*p_Np*p_Nfields+j*p_Nq+i;
-	    for(int fld=0; fld<p_Nfields; fld++){
-	      rhsq[rhsId+fld*p_Np] += s_fluxq[es][fld][j][i];
-	    }
-	  }
+          for(int j=0;j<p_Nq;++j){
+            // const dlong   e   = elementIds[et];
+            const dlong   e = et;
+            const dlong rhsId = e*p_Np*p_Nfields+j*p_Nq+i;
+            for(int fld=0; fld<p_Nfields; fld++){
+              rhsq[rhsId+fld*p_Np] += s_fluxq[es][fld][j][i];
+            }
+          }
         }
       }
     }
diff --git a/solvers/lbs/okl/lbsSurfaceTet3D.okl b/solvers/lbs/okl/lbsSurfaceTet3D.okl
index e5f9b055c..d5b6815fd 100644
--- a/solvers/lbs/okl/lbsSurfaceTet3D.okl
+++ b/solvers/lbs/okl/lbsSurfaceTet3D.okl
@@ -50,118 +50,116 @@ if (bc==1 || bc==2 || bc==4 || bc==5){
     @exclusive dlong e;
 
     // for all face nodes of all elements
-      for(int n=0;n<p_maxNodes;++n;@inner(0)){ // maxNodes = max(Nfp*Nfaces,Np)
-        if(et<Nelements){
-          // e = elementIds[et];
-          e = et; 
-          if(n<p_Nfp*p_Nfaces){
-            // find face that owns this node
-            const int face = n/p_Nfp;
-
-            // load surface geofactors for this face
-            const dlong sid   = p_Nsgeo*(e*p_Nfaces+face);
-            const dfloat nx   = sgeo[sid+p_NXID];
-            const dfloat ny   = sgeo[sid+p_NYID];
-            const dfloat nz   = sgeo[sid+p_NZID];
-            const dfloat sJ   = sgeo[sid+p_SJID];
-            const dfloat invJ = sgeo[sid+p_IJID];
-
-            const dfloat sc = sJ*invJ;
-
-            const dlong id  = e*p_Nfp*p_Nfaces + n;
-            const dlong idM = vmapM[id];
-            const dlong idP = vmapP[id];
-
-            // load traces
-            const dlong eM = e;
-            const dlong eP = idP/p_Np;
-            const int vidM = idM%p_Np;
-            const int vidP = idP%p_Np;
-
-            const dlong qidM = eM*p_Np*p_Nfields + vidM;
-            const dlong qidP = eP*p_Np*p_Nfields + vidP;
-                
-            // apply boundary condition
-            const int bc = EToB[face+p_Nfaces*e];
-            
-            dfloat rm, um, vm, wm, rb, ub, vb, wb; 
-
-            if(bc>0 || bc==-1){
-              const dlong uidM = eM*p_Np*p_Nmacro + vidM; 
-              rm   = U[uidM+ 0*p_Np];  
-              um   = U[uidM+ 1*p_Np];  
-              vm   = U[uidM+ 2*p_Np];  
-              wm   = U[uidM+ 3*p_Np];  
-
-             lbsBoundaryConditions3D(bc, nu, time, x[idM], y[idM], z[idM], nx, ny, nz,  
-                                     rm, um, vm, wm,
-                                     &rb, &ub, &vb, &wb); 
-            }
+    for(int n=0;n<p_maxNodes;++n;@inner(0)){ // maxNodes = max(Nfp*Nfaces,Np)
+      if(et<Nelements){
+        // e = elementIds[et];
+        e = et;
+        if(n<p_Nfp*p_Nfaces){
+          // find face that owns this node
+          const int face = n/p_Nfp;
+
+          // load surface geofactors for this face
+          const dlong sid   = p_Nsgeo*(e*p_Nfaces+face);
+          const dfloat nx   = sgeo[sid+p_NXID];
+          const dfloat ny   = sgeo[sid+p_NYID];
+          const dfloat nz   = sgeo[sid+p_NZID];
+          const dfloat sJ   = sgeo[sid+p_SJID];
+          const dfloat invJ = sgeo[sid+p_IJID];
+
+          const dfloat sc = sJ*invJ;
+
+          const dlong id  = e*p_Nfp*p_Nfaces + n;
+          const dlong idM = vmapM[id];
+          const dlong idP = vmapP[id];
+
+          // load traces
+          const dlong eM = e;
+          const dlong eP = idP/p_Np;
+          const int vidM = idM%p_Np;
+          const int vidP = idP%p_Np;
+
+          const dlong qidM = eM*p_Np*p_Nfields + vidM;
+          const dlong qidP = eP*p_Np*p_Nfields + vidP;
+
+          // apply boundary condition
+          const int bc = EToB[face+p_Nfaces*e];
+
+          dfloat rm, um, vm, wm, rb, ub, vb, wb;
+
+          if(bc>0 || bc==-1){
+            const dlong uidM = eM*p_Np*p_Nmacro + vidM;
+            rm   = U[uidM+ 0*p_Np];
+            um   = U[uidM+ 1*p_Np];
+            vm   = U[uidM+ 2*p_Np];
+            wm   = U[uidM+ 3*p_Np];
+
+           lbsBoundaryConditions3D(bc, nu, time, x[idM], y[idM], z[idM], nx, ny, nz,
+                                   rm, um, vm, wm,
+                                   &rb, &ub, &vb, &wb);
+          }
 
 
-            dfloat qm[p_Nfields]; 
-            dfloat qp[p_Nfields]; 
+          dfloat qm[p_Nfields];
+          dfloat qp[p_Nfields];
 
-            #pragma unroll p_Nfields
-            for(int fld=0; fld<p_Nfields;++fld){
-              qm[fld] = q[qidM + fld*p_Np];
-              qp[fld] = q[qidP + fld*p_Np];
-            }
+          #pragma unroll p_Nfields
+          for(int fld=0; fld<p_Nfields;++fld){
+            qm[fld] = q[qidM + fld*p_Np];
+            qp[fld] = q[qidP + fld*p_Np];
+          }
 
 
-            #pragma unroll p_Nfields
-            for(int fld=0; fld<p_Nfields;++fld){
-              const dfloat fn = F[qidM + fld*p_Np]; 
-              const dfloat ex = LBM[fld + 1*p_Nfields]; 
-              const dfloat ey = LBM[fld + 2*p_Nfields]; 
-              const dfloat ez = LBM[fld + 3*p_Nfields]; 
-              const dfloat en = ex*nx + ey*ny + ez*nz; // need to modify for bc
-
-              if(bc>0){
-                const dfloat ew   =  LBM[fld + 0*p_Nfields];
-                const int   idr   =  LMAP[fld];
-                applyBC3D(bc, fld, idr, dt, en, nx, ny, nz, ew, ex, ey, ez, rb, ub, vb, wb, fn, qm, qp); 
-              }
-             
-              s_fluxq[fld][n] = 0.5f*sc*(en -fabs(en))*(qm[fld] - qp[fld]);
+          #pragma unroll p_Nfields
+          for(int fld=0; fld<p_Nfields;++fld){
+            const dfloat fn = F[qidM + fld*p_Np];
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
+            const dfloat ez = LBM[fld + 3*p_Nfields];
+            const dfloat en = ex*nx + ey*ny + ez*nz; // need to modify for bc
+
+            if(bc>0){
+              const dfloat ew   =  LBM[fld + 0*p_Nfields];
+              const int   idr   =  LMAP[fld];
+              applyBC3D(bc, fld, idr, dt, en, nx, ny, nz, ew, ex, ey, ez, rb, ub, vb, wb, fn, qm, qp);
             }
+
+            s_fluxq[fld][n] = 0.5f*sc*(en -fabs(en))*(qm[fld] - qp[fld]);
           }
         }
       }
+    }
 
-    // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
-    // for each node in the element
-      for(int n=0;n<p_maxNodes;++n;@inner(0)){
-          if(n<p_Np){
-            // const int id = nrhs*p_Nfields*(p_Np*e + n) + p_Nfields*shift;
-            const dlong id = e*p_Nfields*p_Np + n ;
+  // for each node in the element
+    for(int n=0;n<p_maxNodes;++n;@inner(0)){
+      if(n<p_Np){
+        // const int id = nrhs*p_Nfields*(p_Np*e + n) + p_Nfields*shift;
+        const dlong id = e*p_Nfields*p_Np + n ;
 
-            dfloat rhsf[p_Nfields]; 
-            #pragma unroll p_Nfields
-            for(int fld=0; fld<p_Nfields; ++fld){
-              rhsf[fld] = rhsq[id+fld*p_Np];
-            }
-
-            // rhs += LIFT*((sJ/J)*(A*nx+B*ny)*(q^* - q^-))
-            #pragma unroll p_NfacesNfp
-              for(int m=0;m<p_NfacesNfp;++m){
-                dfloat L = LIFT[n+m*p_Np];
+        dfloat rhsf[p_Nfields];
+        #pragma unroll p_Nfields
+        for(int fld=0; fld<p_Nfields; ++fld){
+          rhsf[fld] = rhsq[id+fld*p_Np];
+        }
 
-                #pragma unroll p_Nfields
-                for(int fld=0; fld<p_Nfields; ++fld){
-                 rhsf[fld] += L*s_fluxq[fld][m];
-                }
-                
-              }
+        // rhs += LIFT*((sJ/J)*(A*nx+B*ny)*(q^* - q^-))
+        #pragma unroll p_NfacesNfp
+          for(int m=0;m<p_NfacesNfp;++m){
+            dfloat L = LIFT[n+m*p_Np];
 
             #pragma unroll p_Nfields
             for(int fld=0; fld<p_Nfields; ++fld){
-             rhsq[id+fld*p_Np]= rhsf[fld];
+             rhsf[fld] += L*s_fluxq[fld][m];
             }
 
           }
+
+        #pragma unroll p_Nfields
+        for(int fld=0; fld<p_Nfields; ++fld){
+         rhsq[id+fld*p_Np]= rhsf[fld];
         }
+
       }
-}
\ No newline at end of file
+    }
+  }
+}
diff --git a/solvers/lbs/okl/lbsSurfaceTri2D.okl b/solvers/lbs/okl/lbsSurfaceTri2D.okl
index c95129486..4b934cb9a 100644
--- a/solvers/lbs/okl/lbsSurfaceTri2D.okl
+++ b/solvers/lbs/okl/lbsSurfaceTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -154,8 +154,6 @@ if (bc==1 || bc==2 || bc==4 || bc==5){
       }
     }
 
-    // wait for all @shared memory writes of the previous inner loop to complete
-    @barrier("local");
 
     // for each node in the element
     for(int es=0;es<p_NblockS;++es;@inner(1)){
@@ -347,4 +345,4 @@ if (bc==1 || bc==2 || bc==4 || bc==5){
 //       }
 //     }
 //   }
-// }
\ No newline at end of file
+// }
diff --git a/solvers/lbs/okl/lbsVolumeHex3D.okl b/solvers/lbs/okl/lbsVolumeHex3D.okl
index b6773d5dc..e1f4315de 100644
--- a/solvers/lbs/okl/lbsVolumeHex3D.okl
+++ b/solvers/lbs/okl/lbsVolumeHex3D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -26,19 +26,19 @@
 
 
 @kernel void lbsVolumeHex3D(const dlong Nelements,
-			     // @restrict const  dlong  *  elementIds,
-			     @restrict const  dfloat *  vgeo,
-			     @restrict const  dfloat *  DT,
-			     @restrict const  dfloat * x,
-			     @restrict const  dfloat * y,
-			     @restrict const  dfloat * z,
-			     const dfloat t,
-			     const dfloat nu,
-			     const dfloat gamma,
-			     @restrict const  dfloat *  LBM,
-			     @restrict const  dfloat *  q,
-			     @restrict const  dfloat *  U,
-			     @restrict dfloat *  rhsq){
+                             // @restrict const  dlong  *  elementIds,
+                             @restrict const  dfloat *  vgeo,
+                             @restrict const  dfloat *  DT,
+                             @restrict const  dfloat * x,
+                             @restrict const  dfloat * y,
+                             @restrict const  dfloat * z,
+                             const dfloat t,
+                             const dfloat nu,
+                             const dfloat gamma,
+                             @restrict const  dfloat *  LBM,
+                             @restrict const  dfloat *  q,
+                             @restrict const  dfloat *  U,
+                             @restrict dfloat *  rhsq){
 
   for(dlong et=0;et<Nelements;++et;@outer(0)){  // for all elements
 
@@ -51,28 +51,26 @@
         for(int i=0;i<p_Nq;++i;@inner(0)){
           // e = elementIds[et];
           e = et;
-	    		const dlong idf = i + j*p_Nq + k*p_Nq*p_Nq + p_Nfields*p_Np*e;
+          const dlong idf = i + j*p_Nq + k*p_Nq*p_Nq + p_Nfields*p_Np*e;
 
-					#pragma unroll p_Nfields
-	    		for(int fld=0; fld<p_Nfields;++fld){
-	      		s_q[fld][k][j][i] = q[idf+fld*p_Np];
-	    		}
+#pragma unroll p_Nfields
+          for(int fld=0; fld<p_Nfields;++fld){
+            s_q[fld][k][j][i] = q[idf+fld*p_Np];
+          }
 
-	  			if(k==0)
+          if(k==0)
             s_DT[j][i] = DT[j*p_Nq+i];
-				}
+        }
       }
     }
 
-   // make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
           const dlong gid   = e*p_Np*p_Nvgeo + k*p_Nq*p_Nq + j*p_Nq +i;
 
-	   			const dfloat drdx = vgeo[gid + p_RXID*p_Np];
+          const dfloat drdx = vgeo[gid + p_RXID*p_Np];
           const dfloat drdy = vgeo[gid + p_RYID*p_Np];
           const dfloat drdz = vgeo[gid + p_RZID*p_Np];
 
@@ -84,9 +82,9 @@
           const dfloat dtdy = vgeo[gid + p_TYID*p_Np];
           const dfloat dtdz = vgeo[gid + p_TZID*p_Np];
 
-	      // compute 'r' and 's' derivatives of (q_m) at node n
-      	dfloat r_dqdr[p_Nfields], r_dqds[p_Nfields], r_dqdt[p_Nfields];
-      	dfloat r_dqdx[p_Nfields], r_dqdy[p_Nfields], r_dqdz[p_Nfields];
+          // compute 'r' and 's' derivatives of (q_m) at node n
+          dfloat r_dqdr[p_Nfields], r_dqds[p_Nfields], r_dqdt[p_Nfields];
+          dfloat r_dqdx[p_Nfields], r_dqdy[p_Nfields], r_dqdz[p_Nfields];
 
           #pragma unroll p_Nfields
             for(int fld=0; fld<p_Nfields;++fld){
@@ -94,7 +92,7 @@
               r_dqdx[fld] = 0.f, r_dqdy[fld] = 0.f, r_dqdz[fld] = 0.f;
             }
 
-					#pragma unroll p_Nq
+          #pragma unroll p_Nq
           for(int m=0;m<p_Nq;++m){
             const dfloat Dim = s_DT[i][m];
             const dfloat Djm = s_DT[j][m];
@@ -126,14 +124,14 @@
 
           const dlong idf = i + j*p_Nq + k*p_Nq*p_Nq + p_Nfields*p_Np*e;
 
-	    for(int fld=0; fld<p_Nfields;++fld){
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
-            const dfloat ez = LBM[fld + 3*p_Nfields]; 
+          for(int fld=0; fld<p_Nfields;++fld){
+            const dfloat ex = LBM[fld + 1*p_Nfields];
+            const dfloat ey = LBM[fld + 2*p_Nfields];
+            const dfloat ez = LBM[fld + 3*p_Nfields];
             rhsq[idf + fld*p_Np] = -(ex*r_dqdx[fld] + ey*r_dqdy[fld] + ez*r_dqdz[fld]);
           }
-	     }
-	    }
+        }
+      }
     }
   }
- }
\ No newline at end of file
+}
diff --git a/solvers/lbs/okl/lbsVolumeQuad2D.okl b/solvers/lbs/okl/lbsVolumeQuad2D.okl
index c32842ee0..be530bd02 100644
--- a/solvers/lbs/okl/lbsVolumeQuad2D.okl
+++ b/solvers/lbs/okl/lbsVolumeQuad2D.okl
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -26,19 +26,19 @@
 
 
 @kernel void lbsVolumeQuad2D(const dlong Nelements,
-			     // @restrict const  dlong  *  elementIds,
-			     @restrict const  dfloat *  vgeo,
-			     @restrict const  dfloat *  DT,
-			     @restrict const  dfloat * x,
-			     @restrict const  dfloat * y,
-			     @restrict const  dfloat * z,
-			     const dfloat t,
-			     const dfloat nu,
-			     const dfloat gamma,
-			     @restrict const  dfloat *  LBM,
-			     @restrict const  dfloat *  q,
-			     @restrict const  dfloat *  U,
-			     @restrict dfloat *  rhsq){
+                             // @restrict const  dlong  *  elementIds,
+                             @restrict const  dfloat *  vgeo,
+                             @restrict const  dfloat *  DT,
+                             @restrict const  dfloat * x,
+                             @restrict const  dfloat * y,
+                             @restrict const  dfloat * z,
+                             const dfloat t,
+                             const dfloat nu,
+                             const dfloat gamma,
+                             @restrict const  dfloat *  LBM,
+                             @restrict const  dfloat *  q,
+                             @restrict const  dfloat *  U,
+                             @restrict dfloat *  rhsq){
 
   for(dlong eo=0;eo<Nelements;eo+=p_NblockV;@outer(0)){  // for all elements
 
@@ -46,46 +46,44 @@
     @shared dfloat s_DT[p_Nq][p_Nq];
     @exclusive dlong e;
 
-    // @exclusive dfloat fg[p_NblockV][p_Nfields]; 
+    // @exclusive dfloat fg[p_NblockV][p_Nfields];
 
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
-	  const dlong et = eo+es; // element in block
-	  if(et<Nelements){
-	    // e = elementIds[et];
-	    e = et;
-	    const dlong idf = i + j*p_Nq + e*p_Nfields*p_Np;
+          const dlong et = eo+es; // element in block
+          if(et<Nelements){
+            // e = elementIds[et];
+            e = et;
+            const dlong idf = i + j*p_Nq + e*p_Nfields*p_Np;
 
 #pragma unroll p_Nfields
-	    for(int fld=0; fld<p_Nfields;++fld){
-	      s_q[es][fld][j][i] = q[idf+fld*p_Np];
-	    }
-	  }
+            for(int fld=0; fld<p_Nfields;++fld){
+              s_q[es][fld][j][i] = q[idf+fld*p_Np];
+            }
+          }
 
-	  if(es==0)
+          if(es==0)
             s_DT[j][i] = DT[j*p_Nq+i];
-	}
+        }
       }
     }
 
-    // make sure all node data is loaded into @shared
-    @barrier("local");
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
-	  			const dlong et = eo+es; // element in block
-	  if(et<Nelements){
-	    // prefetch geometric factors (constant on triangle)
-	    const dlong gid   = e*p_Np*p_Nvgeo+ j*p_Nq +i;
-	    const dfloat drdx = vgeo[gid + p_RXID*p_Np];
-	    const dfloat drdy = vgeo[gid + p_RYID*p_Np];
-	    const dfloat dsdx = vgeo[gid + p_SXID*p_Np];
-	    const dfloat dsdy = vgeo[gid + p_SYID*p_Np];
-
-	    // compute 'r' and 's' derivatives of (q_m) at node n
-	    dfloat r_dqdr[p_Nfields], r_dqds[p_Nfields];
-	    dfloat r_dqdx[p_Nfields], r_dqdy[p_Nfields];
+                                const dlong et = eo+es; // element in block
+          if(et<Nelements){
+            // prefetch geometric factors (constant on triangle)
+            const dlong gid   = e*p_Np*p_Nvgeo+ j*p_Nq +i;
+            const dfloat drdx = vgeo[gid + p_RXID*p_Np];
+            const dfloat drdy = vgeo[gid + p_RYID*p_Np];
+            const dfloat dsdx = vgeo[gid + p_SXID*p_Np];
+            const dfloat dsdy = vgeo[gid + p_SYID*p_Np];
+
+            // compute 'r' and 's' derivatives of (q_m) at node n
+            dfloat r_dqdr[p_Nfields], r_dqds[p_Nfields];
+            dfloat r_dqdx[p_Nfields], r_dqdy[p_Nfields];
 
 #pragma unroll p_Nfields
             for(int fld=0; fld<p_Nfields;++fld){
@@ -109,23 +107,23 @@
                 r_dqds[fld] += Djm*s_q[es][fld][m][i];
             }
 
-	    // Compute derivatives in physical coordinates
+            // Compute derivatives in physical coordinates
 #pragma unroll p_Nfields
             for(int fld=0; fld<p_Nfields;++fld){
               r_dqdx[fld] = drdx*r_dqdr[fld] + dsdx*r_dqds[fld];
               r_dqdy[fld] = drdy*r_dqdr[fld] + dsdy*r_dqds[fld];
             }
 
-	    // Update
-	    const dlong idf = i + j*p_Nq + e*p_Nfields*p_Np;
+            // Update
+            const dlong idf = i + j*p_Nq + e*p_Nfields*p_Np;
 
-	    for(int fld=0; fld<p_Nfields;++fld){
-	      const dfloat ex = LBM[fld + 1*p_Nfields]; 
-	      const dfloat ey = LBM[fld + 2*p_Nfields]; 
-	      rhsq[idf + fld*p_Np] = -(ex*r_dqdx[fld] + ey*r_dqdy[fld]);
-	    }
-	  }
-	}
+            for(int fld=0; fld<p_Nfields;++fld){
+              const dfloat ex = LBM[fld + 1*p_Nfields];
+              const dfloat ey = LBM[fld + 2*p_Nfields];
+              rhsq[idf + fld*p_Np] = -(ex*r_dqdx[fld] + ey*r_dqdy[fld]);
+            }
+          }
+        }
       }
     }
   }
diff --git a/solvers/lbs/okl/lbsVolumeTet3D.okl b/solvers/lbs/okl/lbsVolumeTet3D.okl
index b61bab255..e538b5230 100644
--- a/solvers/lbs/okl/lbsVolumeTet3D.okl
+++ b/solvers/lbs/okl/lbsVolumeTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -45,22 +45,20 @@ SOFTWARE.
     @shared dfloat s_q[p_Nfields][p_Np];
     @exclusive dlong e;
 
-      for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
-          // e = elementIds[et];
-          e = et;
-          const dlong idf = e*p_Nfields*p_Np + n;
-          
-          #pragma unroll p_Nfields
-          for(int fld=0; fld<p_Nfields;++fld){
-            s_q[fld][n] = q[idf+fld*p_Np];
-          }
-        }
-    // make sure all node data is loaded into @shared
-    @barrier("local");
+    for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
+      // e = elementIds[et];
+      e = et;
+      const dlong idf = e*p_Nfields*p_Np + n;
+
+      #pragma unroll p_Nfields
+      for(int fld=0; fld<p_Nfields;++fld){
+        s_q[fld][n] = q[idf+fld*p_Np];
+      }
+    }
 
-      for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
-        if(et<Nelements){
-           // prefetch geometric factors (constant on triangle)
+    for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
+      if(et<Nelements){
+        // prefetch geometric factors (constant on triangle)
         const dfloat drdx = vgeo[e*p_Nvgeo + p_RXID];
         const dfloat drdy = vgeo[e*p_Nvgeo + p_RYID];
         const dfloat drdz = vgeo[e*p_Nvgeo + p_RZID];
@@ -71,47 +69,47 @@ SOFTWARE.
         const dfloat dtdy = vgeo[e*p_Nvgeo + p_TYID];
         const dfloat dtdz = vgeo[e*p_Nvgeo + p_TZID];
 
-                // compute 'r' and 's' derivatives of (q_m) at node n
-      dfloat r_dqdr[p_Nfields], r_dqds[p_Nfields], r_dqdt[p_Nfields];
-      dfloat r_dqdx[p_Nfields], r_dqdy[p_Nfields], r_dqdz[p_Nfields];
-
-          #pragma unroll p_Nfields
-            for(int fld=0; fld<p_Nfields;++fld){
-              r_dqdr[fld] = 0.f, r_dqds[fld] = 0.f, r_dqdt[fld] = 0.f;
-              r_dqdx[fld] = 0.f, r_dqdy[fld] = 0.f, r_dqdz[fld] = 0.f;
-            }
+        // compute 'r' and 's' derivatives of (q_m) at node n
+        dfloat r_dqdr[p_Nfields], r_dqds[p_Nfields], r_dqdt[p_Nfields];
+        dfloat r_dqdx[p_Nfields], r_dqdy[p_Nfields], r_dqdz[p_Nfields];
 
-          #pragma unroll p_Np
-            for(int i=0;i<p_Np;++i){
-              const dfloat Drni = D[n+i*p_Np+0*p_Np*p_Np];
-              const dfloat Dsni = D[n+i*p_Np+1*p_Np*p_Np];
-              const dfloat Dtni = D[n+i*p_Np+2*p_Np*p_Np];
-              #pragma unroll p_Nfields
-              for(int fld=0; fld<p_Nfields;++fld){
-                r_dqdr[fld] += Drni*s_q[fld][i];
-                r_dqds[fld] += Dsni*s_q[fld][i];
-                r_dqdt[fld] += Dtni*s_q[fld][i];
-              }
-            }
+        #pragma unroll p_Nfields
+          for(int fld=0; fld<p_Nfields;++fld){
+            r_dqdr[fld] = 0.f, r_dqds[fld] = 0.f, r_dqdt[fld] = 0.f;
+            r_dqdx[fld] = 0.f, r_dqdy[fld] = 0.f, r_dqdz[fld] = 0.f;
+          }
 
-          // Compute derivatives in physical coordinates
-          #pragma unroll p_Nfields
+        #pragma unroll p_Np
+          for(int i=0;i<p_Np;++i){
+            const dfloat Drni = D[n+i*p_Np+0*p_Np*p_Np];
+            const dfloat Dsni = D[n+i*p_Np+1*p_Np*p_Np];
+            const dfloat Dtni = D[n+i*p_Np+2*p_Np*p_Np];
+            #pragma unroll p_Nfields
             for(int fld=0; fld<p_Nfields;++fld){
-              r_dqdx[fld] = drdx*r_dqdr[fld] + dsdx*r_dqds[fld] + dtdx*r_dqdt[fld];
-              r_dqdy[fld] = drdy*r_dqdr[fld] + dsdy*r_dqds[fld] + dtdy*r_dqdt[fld];
-              r_dqdz[fld] = drdz*r_dqdr[fld] + dsdz*r_dqds[fld] + dtdz*r_dqdt[fld];
+              r_dqdr[fld] += Drni*s_q[fld][i];
+              r_dqds[fld] += Dsni*s_q[fld][i];
+              r_dqdt[fld] += Dtni*s_q[fld][i];
             }
+          }
 
-          // Update
-          const dlong idf = e*p_Nfields*p_Np + n;
-
+        // Compute derivatives in physical coordinates
+        #pragma unroll p_Nfields
           for(int fld=0; fld<p_Nfields;++fld){
-            const dfloat ex = LBM[fld + 1*p_Nfields]; 
-            const dfloat ey = LBM[fld + 2*p_Nfields]; 
-            const dfloat ez = LBM[fld + 3*p_Nfields]; 
-            rhsq[idf + fld*p_Np] = -(ex*r_dqdx[fld] + ey*r_dqdy[fld] + ez*r_dqdz[fld]);
+            r_dqdx[fld] = drdx*r_dqdr[fld] + dsdx*r_dqds[fld] + dtdx*r_dqdt[fld];
+            r_dqdy[fld] = drdy*r_dqdr[fld] + dsdy*r_dqds[fld] + dtdy*r_dqdt[fld];
+            r_dqdz[fld] = drdz*r_dqdr[fld] + dsdz*r_dqds[fld] + dtdz*r_dqdt[fld];
           }
+
+        // Update
+        const dlong idf = e*p_Nfields*p_Np + n;
+
+        for(int fld=0; fld<p_Nfields;++fld){
+          const dfloat ex = LBM[fld + 1*p_Nfields];
+          const dfloat ey = LBM[fld + 2*p_Nfields];
+          const dfloat ez = LBM[fld + 3*p_Nfields];
+          rhsq[idf + fld*p_Np] = -(ex*r_dqdx[fld] + ey*r_dqdy[fld] + ez*r_dqdz[fld]);
         }
       }
     }
-  }
\ No newline at end of file
+  }
+}
diff --git a/solvers/lbs/okl/lbsVolumeTri2D.okl b/solvers/lbs/okl/lbsVolumeTri2D.okl
index 5ec984064..0ce0ee83b 100644
--- a/solvers/lbs/okl/lbsVolumeTri2D.okl
+++ b/solvers/lbs/okl/lbsVolumeTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -64,8 +64,6 @@ SOFTWARE.
       }
     }
 
-    // make sure all node data is loaded into @shared
-    @barrier("local");
 
     for(int es=0;es<p_NblockV;++es;@inner(1)){// for all elements in block
       for(int n=0;n<p_Np;++n;@inner(0)){     // for all nodes in this element
@@ -119,4 +117,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/lbs/okl/lbsVorticityHex3D.okl b/solvers/lbs/okl/lbsVorticityHex3D.okl
index 27da9791c..8eb6dede0 100644
--- a/solvers/lbs/okl/lbsVorticityHex3D.okl
+++ b/solvers/lbs/okl/lbsVorticityHex3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -55,8 +55,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     for(int k=0;k<p_Nq;++k;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -114,4 +112,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/lbs/okl/lbsVorticityQuad2D.okl b/solvers/lbs/okl/lbsVorticityQuad2D.okl
index 180a8cf1c..6c8f1ce94 100644
--- a/solvers/lbs/okl/lbsVorticityQuad2D.okl
+++ b/solvers/lbs/okl/lbsVorticityQuad2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -54,8 +54,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     for(int es=0;es<p_NblockV;++es;@inner(2)){
       for(int j=0;j<p_Nq;++j;@inner(1)){
         for(int i=0;i<p_Nq;++i;@inner(0)){
@@ -91,4 +89,4 @@ SOFTWARE.
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/solvers/lbs/okl/lbsVorticityTet3D.okl b/solvers/lbs/okl/lbsVorticityTet3D.okl
index fb8994c9e..64ca15018 100644
--- a/solvers/lbs/okl/lbsVorticityTet3D.okl
+++ b/solvers/lbs/okl/lbsVorticityTet3D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -49,8 +49,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
         const dlong e = eo+es;
diff --git a/solvers/lbs/okl/lbsVorticityTri2D.okl b/solvers/lbs/okl/lbsVorticityTri2D.okl
index 354cb7c94..588e74f5f 100644
--- a/solvers/lbs/okl/lbsVorticityTri2D.okl
+++ b/solvers/lbs/okl/lbsVorticityTri2D.okl
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -47,8 +47,6 @@ SOFTWARE.
       }
     }
 
-    @barrier("local");
-
     for(int es=0;es<p_NblockV;++es;@inner(1)){
       for(int n=0;n<p_Np;++n;@inner(0)){
         const dlong e = eo+es;
diff --git a/solvers/lbs/src/lbsLatticeSetup.cpp b/solvers/lbs/src/lbsLatticeSetup.cpp
index 7e3b72b3b..54df964f2 100644
--- a/solvers/lbs/src/lbsLatticeSetup.cpp
+++ b/solvers/lbs/src/lbsLatticeSetup.cpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -36,8 +36,8 @@ void lbs_t::latticeSetup(){
     Nfields  = 9; 
     Nmacro = mesh.dim + 1;
     
-    LBM   = (dfloat *) calloc(Nfields*Nmacro,sizeof(dfloat)); 
-    LMAP  = (int *) calloc(Nfields,sizeof(int)); 
+    LBM.malloc(Nfields*Nmacro, 0.0); // zero-filled, as the replaced calloc was
+    LMAP.malloc(Nfields, 0);         // zero-filled, as the replaced calloc was
 
     LBM[0 + 0*Nfields] = 4.0/9.0; 
     LBM[0 + 1*Nfields] = 0.0; 
@@ -64,8 +64,8 @@ void lbs_t::latticeSetup(){
     Nfields  = 15; 
     Nmacro   = mesh.dim + 1;
 
-    LBM   = (dfloat *) calloc(Nfields*Nmacro,sizeof(dfloat)); 
-    LMAP  = (int *) calloc(Nfields,sizeof(int)); 
+    LBM.malloc(Nfields*Nmacro, 0.0); // zero-filled, as the replaced calloc was
+    LMAP.malloc(Nfields, 0);         // zero-filled, as the replaced calloc was
     // Weights
     LBM[0  + 0*Nfields] = 2.0/ 9.0; LMAP[0 ] = 0; 
 
@@ -109,8 +109,8 @@ void lbs_t::latticeSetup(){
     Nfields  = 19; 
     Nmacro   = mesh.dim + 1;
 
-    LBM   = (dfloat *) calloc(Nfields*Nmacro,sizeof(dfloat)); 
-    LMAP  = (int *) calloc(Nfields,sizeof(int)); 
+    LBM.malloc(Nfields*Nmacro, 0.0); // zero-filled, as the replaced calloc was
+    LMAP.malloc(Nfields, 0);         // zero-filled, as the replaced calloc was
     // Weights
     LBM[0  + 0*Nfields] = 1.0/ 3.0; LMAP[0 ] = 0; 
 
@@ -160,10 +160,14 @@ void lbs_t::latticeSetup(){
     LBM[18  + 1*Nfields] =  0.0; LBM[18  + 2*Nfields] =  1.0; LBM[18  + 3*Nfields] = -1.0;
 
   }else {
-    LIBP_ABORT(string("Requested VELOCIY MODEL not found."));
+    LIBP_FORCE_ABORT("Requested VELOCITY MODEL not found.");
   }
 
   alpha    = nu/(c*c); // Relaxation parameter 
   tauInv   = 1.0/alpha; // need to check that....
   RT       = c*c; // remove this later, no need!!!!!
+
+  // Lattice-Boltzmann Model
+  o_LBM = platform.malloc<dfloat>(LBM);
+  o_LMAP = platform.malloc<int>(LMAP); // LMAP stores int values, not dfloat
 }
diff --git a/solvers/lbs/src/lbsPlotFields.cpp b/solvers/lbs/src/lbsPlotFields.cpp
index 75e17e849..ca266f601 100644
--- a/solvers/lbs/src/lbsPlotFields.cpp
+++ b/solvers/lbs/src/lbsPlotFields.cpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -27,11 +27,11 @@
 #include "lbs.hpp"
 
 // interpolate data to plot nodes and save to file (one per process)
-void lbs_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
+void lbs_t::PlotFields(memory<dfloat>& Q, memory<dfloat>& V, std::string fileName){
 
   FILE *fp;
 
-  fp = fopen(fileName, "w");
+  fp = fopen(fileName.c_str(), "w");
 
   fprintf(fp, "<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n");
   fprintf(fp, "  <UnstructuredGrid>\n");
@@ -44,36 +44,42 @@ void lbs_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
   fprintf(fp, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" Format=\"ascii\">\n");
 
   //scratch space for interpolation
-  size_t NscratchBytes = mymax(mesh.Np, mesh.plotNp)*sizeof(dfloat);
-  dfloat* scratch = (dfloat *) malloc(2*NscratchBytes);
+  size_t Nscratch = std::max(mesh.Np, mesh.plotNp);
+  memory<dfloat> scratch(2*Nscratch);
 
-  dfloat* Ix = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iy = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iz = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ix(mesh.plotNp);
+  memory<dfloat> Iy(mesh.plotNp);
+  memory<dfloat> Iz(mesh.plotNp);
 
   // compute plot node coordinates on the fly
   for(dlong e=0;e<mesh.Nelements;++e){
     mesh.PlotInterp(mesh.x + e*mesh.Np, Ix, scratch);
     mesh.PlotInterp(mesh.y + e*mesh.Np, Iy, scratch);
-    mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
+    if(mesh.dim==3)
+      mesh.PlotInterp(mesh.z + e*mesh.Np, Iz, scratch);
 
-    for(int n=0;n<mesh.plotNp;++n){
-      fprintf(fp, "       ");
-      fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+    if (mesh.dim==2) {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],0.0);
+      }
+    } else {
+      for(int n=0;n<mesh.plotNp;++n){
+        fprintf(fp, "       ");
+        fprintf(fp, "%g %g %g\n", Ix[n],Iy[n],Iz[n]);
+      }
     }
   }
   fprintf(fp, "        </DataArray>\n");
   fprintf(fp, "      </Points>\n");
 
-  free(Ix); free(Iy); free(Iz);
-
-  dfloat* Ir = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iu = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iv = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
-  dfloat* Iw = (dfloat *) malloc(mesh.plotNp*sizeof(dfloat));
+  memory<dfloat> Ir(mesh.plotNp);
+  memory<dfloat> Iu(mesh.plotNp);
+  memory<dfloat> Iv(mesh.plotNp);
+  memory<dfloat> Iw(mesh.plotNp);
 
   fprintf(fp, "      <PointData Scalars=\"scalars\">\n");
-  if (U!=nullptr) {
+  if (U.length()!=0) {
     // write out velocity
     fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Velocity\" NumberOfComponents=\"%d\" Format=\"ascii\">\n", mesh.dim);
     for(dlong e=0;e<mesh.Nelements;++e){
@@ -94,7 +100,7 @@ void lbs_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
     fprintf(fp, "       </DataArray>\n");
   }
 
-  if (U!=nullptr) {
+  if (U.length()!=0) {
     // write out pressure
     fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Density\" Format=\"ascii\">\n");
     for(dlong e=0;e<mesh.Nelements;++e){
@@ -108,7 +114,7 @@ void lbs_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
     fprintf(fp, "       </DataArray>\n");
   }
 
-  if (V!=nullptr) {
+  if (V.length()!=0) {
     // write out vorticity
     if(mesh.dim==2){
       fprintf(fp, "        <DataArray type=\"Float32\" Name=\"Vorticity\" Format=\"ascii\">\n");
@@ -138,8 +144,6 @@ void lbs_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
   }
   fprintf(fp, "     </PointData>\n");
 
-  free(Ir); free(Iu); free(Iv); free(Iw);
-
   fprintf(fp, "    <Cells>\n");
   fprintf(fp, "      <DataArray type=\"Int32\" Name=\"connectivity\" Format=\"ascii\">\n");
 
@@ -180,6 +184,4 @@ void lbs_t::PlotFields(dfloat* Q, dfloat *V, char *fileName){
   fprintf(fp, "  </UnstructuredGrid>\n");
   fprintf(fp, "</VTKFile>\n");
   fclose(fp);
-
-  free(scratch);
 }
diff --git a/solvers/lbs/src/lbsPmlSetup.cpp b/solvers/lbs/src/lbsPmlSetup.cpp
index af00e57c2..b4796acb3 100644
--- a/solvers/lbs/src/lbsPmlSetup.cpp
+++ b/solvers/lbs/src/lbsPmlSetup.cpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -108,7 +108,7 @@ void lbs_t::PmlSetup(){
     int pmlNp = (pmlcubature) ? mesh.cubNp : mesh.Np;
     int pmlNq = (pmlcubature) ? mesh.cubNq : mesh.Nq;
 
-    dfloat *pmlr, *pmls, *pmlt;
+    memory<dfloat> pmlr, pmls, pmlt;
     if(pmlcubature){
       pmlr = mesh.cubr;
       pmls = mesh.cubs;
@@ -121,27 +121,27 @@ void lbs_t::PmlSetup(){
 
     // printf("Setting PML Coefficient \n");
     //set up damping parameter
-    pmlSigma = (dfloat *) calloc(mesh.dim*mesh.NpmlElements*pmlNp,sizeof(dfloat));
+    pmlSigma.malloc(mesh.dim*mesh.NpmlElements*pmlNp, 0.0); // zero-filled, as the replaced calloc was
 
     for (dlong m=0;m<mesh.NpmlElements;m++){
       dlong e     = mesh.pmlElements[m];
       hlong type  = mesh.elementInfo[e];
 
       //element vertices
-      const dfloat *xe = mesh.EX + e*mesh.Nverts;
-      const dfloat *ye = mesh.EY + e*mesh.Nverts;
-      const dfloat *ze = mesh.EZ + e*mesh.Nverts;
+      memory<dfloat> xe = mesh.EX + e*mesh.Nverts;
+      memory<dfloat> ye = mesh.EY + e*mesh.Nverts;
+      memory<dfloat> ze = mesh.EZ + e*mesh.Nverts;
 
       for(int n=0;n<pmlNp;++n){ /* for each node */
         dfloat x  = 0, y  = 0, z  = 0;
         dfloat rn = 0, sn = 0, tn = 0;
-        if(mesh.elementType==TRIANGLES){
+        if(mesh.elementType==Mesh::TRIANGLES){
           rn = pmlr[n];
           sn = pmls[n];
 
           x = -0.5*(rn+sn)*xe[0] + 0.5*(1+rn)*xe[1] + 0.5*(1+sn)*xe[2];
           y = -0.5*(rn+sn)*ye[0] + 0.5*(1+rn)*ye[1] + 0.5*(1+sn)*ye[2];
-        } else if(mesh.elementType==QUADRILATERALS){
+        } else if(mesh.elementType==Mesh::QUADRILATERALS){
           const int i = n%pmlNq;
           const int j = n/pmlNq;
           rn = pmlr[i];
@@ -149,7 +149,7 @@ void lbs_t::PmlSetup(){
 
           x =  0.25*( (1.0-rn)*(1-sn)*xe[0]+(1.0-rn)*(1+sn)*xe[1]+(1.0+rn)*(1+sn)*xe[2]+(1.0+rn)*(1-sn)*xe[3]);
           y =  0.25*( (1.0-rn)*(1-sn)*ye[0]+(1.0-rn)*(1+sn)*ye[1]+(1.0+rn)*(1+sn)*ye[2]+(1.0+rn)*(1-sn)*ye[3]);
-        } else if(mesh.elementType==TETRAHEDRA){
+        } else if(mesh.elementType==Mesh::TETRAHEDRA){
           rn = pmlr[n];
           sn = pmls[n];
           tn = pmlt[n];
@@ -157,7 +157,7 @@ void lbs_t::PmlSetup(){
           x = -0.5*(rn+sn+tn+1)*xe[0] + 0.5*(1+rn)*xe[1] + 0.5*(1+sn)*xe[2] + 0.5*(tn+1)*xe[3];
           y = -0.5*(rn+sn+tn+1)*ye[0] + 0.5*(1+rn)*ye[1] + 0.5*(1+sn)*ye[2] + 0.5*(tn+1)*ye[3];
           z = -0.5*(rn+sn+tn+1)*ze[0] + 0.5*(1+rn)*ze[1] + 0.5*(1+sn)*ze[2] + 0.5*(tn+1)*ze[3];
-        } else if(mesh.elementType==HEXAHEDRA){
+        } else if(mesh.elementType==Mesh::HEXAHEDRA){
           const int i = n%pmlNq;
           const int j = (n/pmlNq)%pmlNq;
           const int k = (n/pmlNq)/pmlNq;
@@ -237,6 +237,6 @@ void lbs_t::PmlSetup(){
 
     // printf("# of PML elements: %d and # of Non-PML elements: %d \n",mesh.NpmlElements, mesh.Nelements-mesh.NpmlElements);
     if (mesh.NpmlElements)
-      o_pmlSigma = platform.malloc(mesh.dim*mesh.NpmlElements*pmlNp*sizeof(dfloat),pmlSigma);
+      o_pmlSigma = platform.malloc<dfloat>(pmlSigma);
   }
 }
diff --git a/solvers/lbs/src/lbsReport.cpp b/solvers/lbs/src/lbsReport.cpp
index a2de81dd0..b249a764e 100644
--- a/solvers/lbs/src/lbsReport.cpp
+++ b/solvers/lbs/src/lbsReport.cpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -38,7 +38,7 @@ void lbs_t::Report(dfloat time, int tstep){
   mesh.MassMatrixApply(o_U, o_Mq);
 
   dlong Nentries = mesh.Nelements*mesh.Np*Nmacro;
-  dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+  dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
   if(mesh.rank==0)
     printf("%5.2f (%d), %5.4f (time, timestep, norm)\n", time, tstep, norm2);
@@ -51,12 +51,12 @@ void lbs_t::Report(dfloat time, int tstep){
     o_Vort.copyTo(Vort);
 
     // output field files
-    string name;
+    std::string name;
     settings.getSetting("OUTPUT FILE NAME", name);
     char fname[BUFSIZ];
     sprintf(fname, "%s_%04d_%04d.vtu", name.c_str(), mesh.rank, frame++);
 
     // PlotFields(o_q, Vort, fname);
-    PlotFields(U, Vort, fname);
+    PlotFields(U, Vort, std::string(fname));
   }
 }
diff --git a/solvers/lbs/src/lbsRun.cpp b/solvers/lbs/src/lbsRun.cpp
index 1f352ef04..75b8e5bd0 100644
--- a/solvers/lbs/src/lbsRun.cpp
+++ b/solvers/lbs/src/lbsRun.cpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -64,8 +64,8 @@ void lbs_t::Run(){
                    mesh.o_x,
                    mesh.o_y,
                    mesh.o_z,
-		   o_U, 
-		   o_q);
+                   o_U,
+                   o_q);
 
   /*
     Artificial warping of time step size for multirate testing
@@ -75,9 +75,9 @@ void lbs_t::Run(){
       settings.compareSetting("TIME INTEGRATOR","MRSAAB3"))
     dt /= (1<<(mesh.mrNlevels-1));
 #endif
-  timeStepper->SetTimeStep(dt);
+  timeStepper.SetTimeStep(dt);
 
-  timeStepper->Run(o_q, startTime, finalTime);
+  timeStepper.Run(*this, o_q, startTime, finalTime);
 
 
   // output norm of final solution
@@ -88,7 +88,7 @@ void lbs_t::Run(){
     mesh.MassMatrixApply(o_U, o_Mq);
 
     dlong Nentries = mesh.Nelements*mesh.Np*Nmacro;
-    dfloat norm2 = sqrt(platform.linAlg.innerProd(Nentries, o_q, o_Mq, mesh.comm));
+    dfloat norm2 = sqrt(platform.linAlg().innerProd(Nentries, o_q, o_Mq, mesh.comm));
 
     if(mesh.rank==0)
       printf("Solution norm = %17.15lg\n", norm2);
diff --git a/solvers/lbs/src/lbsSettings.cpp b/solvers/lbs/src/lbsSettings.cpp
index 6c43eb175..f34abf8ea 100644
--- a/solvers/lbs/src/lbsSettings.cpp
+++ b/solvers/lbs/src/lbsSettings.cpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@
 #include "lbs.hpp"
 
 //settings for lbs solver
-lbsSettings_t::lbsSettings_t(MPI_Comm& _comm):
+lbsSettings_t::lbsSettings_t(comm_t& _comm):
   settings_t(_comm) {
 
   newSetting("DATA FILE",
@@ -96,10 +96,7 @@ lbsSettings_t::lbsSettings_t(MPI_Comm& _comm):
 
 void lbsSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) {
     std::cout << "LBS Settings:\n\n";
     reportSetting("DATA FILE");
     // reportSetting("SPEED OF SOUND");
@@ -120,15 +117,15 @@ void lbsSettings_t::report() {
 
 void lbsSettings_t::parseFromFile(platformSettings_t& platformSettings,
                                   meshSettings_t& meshSettings,
-                                  const string filename) {
+                                  const std::string filename) {
   //read all settings from file
   settings_t s(comm);
   s.readSettingsFromFile(filename);
 
   for(auto it = s.settings.begin(); it != s.settings.end(); ++it) {
-    setting_t* set = it->second;
-    const string name = set->getName();
-    const string val = set->getVal<string>();
+    setting_t& set = it->second;
+    const std::string name = set.getName();
+    const std::string val = set.getVal<std::string>();
     if (platformSettings.hasSetting(name))
       platformSettings.changeSetting(name, val);
     else if (meshSettings.hasSetting(name))
@@ -136,9 +133,7 @@ void lbsSettings_t::parseFromFile(platformSettings_t& platformSettings,
     else if (hasSetting(name)) //self
       changeSetting(name, val);
     else  {
-      stringstream ss;
-      ss << "Unknown setting: [" << name << "] requested";
-      LIBP_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Unknown setting: [" << name << "] requested");
     }
   }
 }
diff --git a/solvers/lbs/src/lbsSetup.cpp b/solvers/lbs/src/lbsSetup.cpp
index 682236cb7..0b18b950f 100644
--- a/solvers/lbs/src/lbsSetup.cpp
+++ b/solvers/lbs/src/lbsSetup.cpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -28,185 +28,162 @@
 #define D2Q9 1
 #define D3Q15 2
 
+void lbs_t::Setup(platform_t& _platform, mesh_t& _mesh,
+                  lbsSettings_t& _settings){
 
-lbs_t& lbs_t::Setup(platform_t& platform, mesh_t& mesh,
-                    lbsSettings_t& settings){
+  platform = _platform;
+  mesh = _mesh;
+  comm = _mesh.comm;
+  settings = _settings;
 
-  lbs_t* lbs = new lbs_t(platform, mesh, settings);
+  //Trigger JIT kernel builds
+  ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add);
 
   // Set reference lattice-Boltzmann data  
-  lbs->latticeSetup();
+  latticeSetup();
   
-  lbs->Npmlfields = mesh.dim*lbs->Nfields;
+  Npmlfields = mesh.dim*Nfields;
 
   // AK: not in use yet ... Setup PML
-  // lbs->PmlSetup();
+  // PmlSetup();
 
   //setup timeStepper
-  dlong Nlocal = mesh.Nelements*mesh.Np*lbs->Nfields;
-  dlong Nhalo  = mesh.totalHaloPairs*mesh.Np*lbs->Nfields;
+  dlong Nlocal = mesh.Nelements*mesh.Np*Nfields;
+  dlong Nhalo  = mesh.totalHaloPairs*mesh.Np*Nfields;
 
   //make array of time step estimates for each element
-  dfloat *EtoDT = (dfloat *) calloc(mesh.Nelements,sizeof(dfloat));
-  dfloat vmax = lbs->MaxWaveSpeed();
+  memory<dfloat> EtoDT(mesh.Nelements);
+  dfloat vmax = MaxWaveSpeed();
   for(dlong e=0;e<mesh.Nelements;++e){
     dfloat h = mesh.ElementCharacteristicLength(e);
     dfloat dtAdv  = h/(vmax*(mesh.N+1.)*(mesh.N+1.));
     EtoDT[e] = dtAdv;
   }
 
-
-
   if (settings.compareSetting("TIME INTEGRATOR","LSERK4")){
-    lbs->timeStepper = new TimeStepper::lserk4(mesh.Nelements, mesh.totalHaloPairs,
-					       mesh.Np, lbs->Nfields, *lbs);
+    timeStepper.Setup<TimeStepper::lserk4>(mesh.Nelements,
+                                           mesh.totalHaloPairs,
+                                           mesh.Np, Nfields,
+                                           platform, comm);
   }else {
-    LIBP_ABORT(string("Requested TIME INTEGRATOR not found."));
+    LIBP_FORCE_ABORT("Requested TIME INTEGRATOR not found.");
   }
 
   
   //setup linear algebra module
-  platform.linAlg.InitKernels({"innerProd"});
+  platform.linAlg().InitKernels({"innerProd"});
 
   /*setup trace halo exchange */
-  lbs->traceHalo = mesh.HaloTraceSetup(lbs->Nfields);
+  traceHalo = mesh.HaloTraceSetup(Nfields);
 
   // compute samples of q at interpolation nodes
-  lbs->q = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat));
-  lbs->o_q = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), lbs->q);
+  q.malloc(Nlocal+Nhalo, 0.0);
+  o_q = platform.malloc<dfloat>(q);
 
-  lbs->F = (dfloat*) calloc(Nlocal+Nhalo, sizeof(dfloat));
-  lbs->o_F = platform.malloc((Nlocal+Nhalo)*sizeof(dfloat), lbs->F);
+  F.malloc(Nlocal+Nhalo, 0.0);
+  o_F = platform.malloc<dfloat>(F);
 
 
-  lbs->Vort = (dfloat*) calloc(mesh.dim*mesh.Nelements*mesh.Np, sizeof(dfloat));
-  lbs->o_Vort = platform.malloc((mesh.dim*mesh.Nelements*mesh.Np)*sizeof(dfloat),
-				lbs->Vort);
+  Vort.malloc(mesh.dim*mesh.Nelements*mesh.Np, 0.0);
+  o_Vort = platform.malloc<dfloat>(Vort);
 
   // Hold macro quantites i.e. density + velocities
-  lbs->U = (dfloat*) calloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np*lbs->Nmacro, sizeof(dfloat));
-  lbs->o_U = platform.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np*lbs->Nmacro*sizeof(dfloat), lbs->U);
-  
-  // Lattice-Boltzmann Model
-  lbs->o_LBM = platform.malloc(lbs->Nfields*lbs->Nmacro*sizeof(dfloat), lbs->LBM);
-  lbs->o_LMAP = platform.malloc(lbs->Nfields*sizeof(int), lbs->LMAP);
-
+  U.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np*Nmacro, 0.0);
+  o_U = platform.malloc<dfloat>(U);
 
-  
   //storage for M*q during reporting
-  lbs->o_Mq = platform.malloc((mesh.Nelements+mesh.totalHaloPairs)*mesh.Np*lbs->Nmacro*sizeof(dfloat), lbs->U);
-  mesh.MassMatrixKernelSetup(lbs->Nmacro); // mass matrix operator
+  o_Mq = platform.malloc<dfloat>(U);
+  mesh.MassMatrixKernelSetup(Nmacro); // mass matrix operator
 
   // // OCCA build stuff
-  occa::properties kernelInfo = mesh.props; //copy base occa properties
+  properties_t kernelInfo = mesh.props; //copy base occa properties
 
   //add boundary data to kernel info
-  string dataFileName;
+  std::string dataFileName;
   settings.getSetting("DATA FILE", dataFileName);
   kernelInfo["includes"] += dataFileName;
 
-  kernelInfo["defines/" "p_Nfields"]= lbs->Nfields;
-  // kernelInfo["defines/" "p_Npmlfields"]= lbs->Npmlfields;
-  kernelInfo["defines/" "p_Nmacro"] = lbs->Nmacro;  
+  kernelInfo["defines/" "p_Nfields"]= Nfields;
+  // kernelInfo["defines/" "p_Npmlfields"]= Npmlfields;
+  kernelInfo["defines/" "p_Nmacro"] = Nmacro;
 
-  kernelInfo["defines/" "p_c"] = lbs->c;  
-  kernelInfo["defines/" "p_ic2"] = 1.0/ pow(lbs->c,2);  
-  kernelInfo["defines/" "p_ic4"] = 1.0/ pow(lbs->c,4);   
+  kernelInfo["defines/" "p_c"] = c;
+  kernelInfo["defines/" "p_ic2"] = 1.0/ pow(c,2);
+  kernelInfo["defines/" "p_ic4"] = 1.0/ pow(c,4);
 
-  int maxNodes = mymax(mesh.Np, (mesh.Nfp*mesh.Nfaces));
+  int maxNodes = std::max(mesh.Np, (mesh.Nfp*mesh.Nfaces));
   kernelInfo["defines/" "p_maxNodes"]= maxNodes;
 
   int blockMax = 256;
   if (platform.device.mode()=="CUDA") blockMax = 512;
 
-  int NblockV = mymax(1, blockMax/mesh.Np);
+  int NblockV = std::max(1, blockMax/mesh.Np);
   kernelInfo["defines/" "p_NblockV"]= NblockV;
 
-  int NblockS = mymax(1, blockMax/maxNodes);
+  int NblockS = std::max(1, blockMax/maxNodes);
   kernelInfo["defines/" "p_NblockS"]= NblockS;
 
-  kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
-
   // set kernel name suffix
-  char *suffix;
-  if(mesh.elementType==TRIANGLES)
-    suffix = strdup("Tri2D");
-  if(mesh.elementType==QUADRILATERALS)
-    suffix = strdup("Quad2D");
-  if(mesh.elementType==TETRAHEDRA)
-    suffix = strdup("Tet3D");
-  if(mesh.elementType==HEXAHEDRA)
-    suffix = strdup("Hex3D");
-
-  char fileName[BUFSIZ], kernelName[BUFSIZ];
+  std::string suffix;
+  if(mesh.elementType==Mesh::TRIANGLES)
+    suffix = "Tri2D";
+  if(mesh.elementType==Mesh::QUADRILATERALS)
+    suffix = "Quad2D";
+  if(mesh.elementType==Mesh::TETRAHEDRA)
+    suffix = "Tet3D";
+  if(mesh.elementType==Mesh::HEXAHEDRA)
+    suffix = "Hex3D";
+
+  std::string oklFilePrefix = DLBS "/okl/";
+  std::string oklFileSuffix = ".okl";
+
+  std::string fileName, kernelName;
 
   if (mesh.dim==2) {
-    sprintf(fileName, DLBS "/okl/lbsInitialCondition2D.okl");
-    sprintf(kernelName, "lbsInitialCondition2D");
+    fileName   = oklFilePrefix + "lbsInitialCondition2D" + oklFileSuffix;
+    kernelName = "lbsInitialCondition2D";
   } else {
-    sprintf(fileName, DLBS "/okl/lbsInitialCondition3D.okl");
-    sprintf(kernelName, "lbsInitialCondition3D");
+    fileName   = oklFilePrefix + "lbsInitialCondition3D" + oklFileSuffix;
+    kernelName = "lbsInitialCondition3D";
   }
-  lbs->initialConditionKernel = platform.buildKernel(fileName, kernelName,
+  initialConditionKernel = platform.buildKernel(fileName, kernelName,
 						     kernelInfo);
   
   // kernels from volume file
-  sprintf(fileName, DLBS "/okl/lbsCollision%s.okl", suffix);  
+  fileName   = oklFilePrefix + "lbsCollision" + suffix + oklFileSuffix;
 
-  sprintf(kernelName, "lbsCollision%s", suffix);
-  lbs->collisionKernel =  platform.buildKernel(fileName, kernelName,
+  kernelName = "lbsCollision" + suffix;
+  collisionKernel =  platform.buildKernel(fileName, kernelName,
 					       kernelInfo);
 
-  sprintf(kernelName, "lbsForcing%s", suffix);
-  lbs->forcingKernel =  platform.buildKernel(fileName, kernelName,
+  kernelName = "lbsForcing" + suffix;
+  forcingKernel =  platform.buildKernel(fileName, kernelName,
 					     kernelInfo);
 
-  sprintf(kernelName, "lbsMoments%s", suffix);
-  lbs->momentsKernel =  platform.buildKernel(fileName, kernelName,
+  kernelName = "lbsMoments" + suffix;
+  momentsKernel =  platform.buildKernel(fileName, kernelName,
 					     kernelInfo);
 
-  sprintf(kernelName, "lbsPhaseField%s", suffix);
-  lbs->phaseFieldKernel =  platform.buildKernel(fileName, kernelName,
+  kernelName = "lbsPhaseField" + suffix;
+  phaseFieldKernel =  platform.buildKernel(fileName, kernelName,
 						kernelInfo);
 
   // kernels from volume file
-  sprintf(fileName, DLBS "/okl/lbsVolume%s.okl", suffix);
-  sprintf(kernelName, "lbsVolume%s", suffix);
-  lbs->volumeKernel =  platform.buildKernel(fileName, kernelName,
+  fileName   = oklFilePrefix + "lbsVolume" + suffix + oklFileSuffix;
+  kernelName = "lbsVolume" + suffix;
+  volumeKernel =  platform.buildKernel(fileName, kernelName,
 					    kernelInfo);
 
   // kernels from surface file
-  sprintf(fileName, DLBS "/okl/lbsSurface%s.okl", suffix);
-  
-  sprintf(kernelName, "lbsSurface%s", suffix);
-  lbs->surfaceKernel = platform.buildKernel(fileName, kernelName,
+  fileName   = oklFilePrefix + "lbsSurface" + suffix + oklFileSuffix;
+  kernelName = "lbsSurface" + suffix;
+  surfaceKernel = platform.buildKernel(fileName, kernelName,
 					    kernelInfo);
 
   // vorticity calculation
-  sprintf(fileName, DLBS "/okl/lbsVorticity%s.okl", suffix);
-  sprintf(kernelName, "lbsVorticity%s", suffix);
+  fileName   = oklFilePrefix + "lbsVorticity" + suffix + oklFileSuffix;
+  kernelName = "lbsVorticity" + suffix;
 
-  lbs->vorticityKernel = platform.buildKernel(fileName, kernelName,
+  vorticityKernel = platform.buildKernel(fileName, kernelName,
 					      kernelInfo);
-
-
-
-  return *lbs;
-}
-
-lbs_t::~lbs_t() {
-  volumeKernel.free();
-  surfaceKernel.free();
-  relaxationKernel.free();
-  pmlVolumeKernel.free();
-  pmlSurfaceKernel.free();
-  pmlRelaxationKernel.free();
-  vorticityKernel.free();
-  initialConditionKernel.free();
-
-  if (timeStepper) delete timeStepper;
-  if (traceHalo) traceHalo->Free();
-
-  for (int lev=0;lev<mesh.mrNlevels;lev++)
-    if (multirateTraceHalo[lev]) multirateTraceHalo[lev]->Free();
 }
diff --git a/solvers/lbs/src/lbsStep.cpp b/solvers/lbs/src/lbsStep.cpp
index cc7ab35d4..d0e861555 100644
--- a/solvers/lbs/src/lbsStep.cpp
+++ b/solvers/lbs/src/lbsStep.cpp
@@ -2,7 +2,7 @@
 
   The MIT License (MIT)
 
-  Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+  Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -32,53 +32,53 @@ dfloat lbs_t::MaxWaveSpeed(){
 }
 
 //evaluate ODE rhs = f(q,t)
-void lbs_t::rhsf(occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void lbs_t::rhsf(deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){
 
   // extract q trace halo and start exchange
-  traceHalo->ExchangeStart(o_Q, 1, ogs_dfloat);
+  traceHalo.ExchangeStart(o_Q, 1);
 
   // compute volume contribution to lbs RHS
   rhsVolume(mesh.Nelements, o_Q, o_RHS, T);
 
   // complete trace halo exchange
-  traceHalo->ExchangeFinish(o_Q, 1, ogs_dfloat);
+  traceHalo.ExchangeFinish(o_Q, 1);
 
   // compute surface contribution to lbs RHS
   rhsSurface(mesh.Nelements, o_Q, o_RHS, T);
 }
 
-void lbs_t::rhsVolume(dlong N, occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void lbs_t::rhsVolume(dlong N, deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){
 
   // compute volume contribution to lbs RHS
   if (N){
-    const dfloat dt    = timeStepper->GetTimeStep();
-    const dfloat gamma = alpha/timeStepper->GetTimeStep();
+    const dfloat dt    = timeStepper.GetTimeStep();
+    const dfloat gamma = alpha/timeStepper.GetTimeStep();
 
     forcingKernel(N,
-		  T,
-		  dt, 
-		  gamma,
-		  nu,
-		  o_LBM,
-		  mesh.o_x,
-		  mesh.o_y,
-		  mesh.o_z,
-		  o_Q,
-		  o_F,
-		  o_U);
+                  T,
+                  dt,
+                  gamma,
+                  nu,
+                  o_LBM,
+                  mesh.o_x,
+                  mesh.o_y,
+                  mesh.o_z,
+                  o_Q,
+                  o_F,
+                  o_U);
 
     collisionKernel(N,
-		    T,
-		    dt, 
-		    gamma,
-		    nu,
-		    o_LBM,
-		    mesh.o_x,
-		    mesh.o_y,
-		    mesh.o_z,
-		    o_F,
-		    o_U,
-		    o_Q);
+                    T,
+                    dt,
+                    gamma,
+                    nu,
+                    o_LBM,
+                    mesh.o_x,
+                    mesh.o_y,
+                    mesh.o_z,
+                    o_F,
+                    o_U,
+                    o_Q);
 
     volumeKernel(N,
                  mesh.o_vgeo,
@@ -97,9 +97,9 @@ void lbs_t::rhsVolume(dlong N, occa::memory& o_Q, occa::memory& o_RHS, const dfl
 }
 
 
-void lbs_t::rhsSurface(dlong N, occa::memory& o_Q, occa::memory& o_RHS, const dfloat T){
+void lbs_t::rhsSurface(dlong N, deviceMemory<dfloat>& o_Q, deviceMemory<dfloat>& o_RHS, const dfloat T){
   
-  const dfloat dt    = timeStepper->GetTimeStep();
+  const dfloat dt = timeStepper.GetTimeStep();
   // // compute volume contribution to lbs RHS
   if (N)
     surfaceKernel(N,
diff --git a/test/makefile b/test/makefile
index 3e9793b87..2e2b3da30 100644
--- a/test/makefile
+++ b/test/makefile
@@ -2,7 +2,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
diff --git a/test/test.py b/test/test.py
index dfd8608aa..f182435c8 100755
--- a/test/test.py
+++ b/test/test.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -88,20 +88,20 @@ def __init__(self, name, value):
     self.name = name
     self.value = value
 
-def writeSetup(settings):
+def writeSetup(filename, settings):
   str_settings=""
   for setting in settings:
     str_settings += "[" + setting.name + "]\n"
     str_settings += str(setting.value) + "\n\n"
 
-  file = open("setup.rc", "w")
+  file = open(filename+".rc", "w")
   file.write(str_settings)
   file.close()
 
 def test(name, cmd, settings, referenceNorm, ranks=1):
 
   #create input file
-  writeSetup(settings)
+  writeSetup("setup",settings)
 
   #print test name
   print(bcolors.TEST + f"{name:.<{alignWidth}}" + bcolors.ENDC, end="", flush=True)
@@ -117,6 +117,8 @@ def test(name, cmd, settings, referenceNorm, ranks=1):
     print(run.stdout.decode())
     print(bcolors.WARNING + name + " stderr:" + bcolors.ENDC)
     print(run.stderr.decode())
+    #save the setup for reproducibility
+    writeSetup(name,settings)
     failed = 1
   else:
     #collect last line of output
@@ -133,6 +135,8 @@ def test(name, cmd, settings, referenceNorm, ranks=1):
         print(bcolors.FAIL + "FAIL" + bcolors.ENDC)
         print(bcolors.WARNING + "Expected Result: " + str(referenceNorm) + bcolors.ENDC)
         print(bcolors.WARNING + "Observed Result: " + str(norm) + bcolors.ENDC)
+        #save the setup for reproducibility
+        writeSetup(name,settings)
         failed = 1
     else:
       #this failure is worse, so dump the whole output for debug
@@ -141,8 +145,16 @@ def test(name, cmd, settings, referenceNorm, ranks=1):
       print(run.stdout.decode())
       print(bcolors.WARNING + name + " stderr:" + bcolors.ENDC)
       print(run.stderr.decode())
+      #save the setup for reproducibility
+      writeSetup(name,settings)
       failed = 1
 
+  # writeSetup(name,settings)
+  # print(bcolors.WARNING + name + " stdout:" + bcolors.ENDC)
+  # print(run.stdout.decode())
+  # print(bcolors.WARNING + name + " stderr:" + bcolors.ENDC)
+  # print(run.stderr.decode())
+
   #clean up
   os.remove(inputRC)
 
@@ -162,10 +174,12 @@ def test(name, cmd, settings, referenceNorm, ranks=1):
   import testTimeStepper
   import testLinearSolver
   import testParAlmond
+  import testParAdogs
   import testInitialGuess
 
   failCount=0;
   failCount+=testMesh.main()
+  failCount+=testParAdogs.main()
   failCount+=testGradient.main()
   failCount+=testAdvection.main()
   failCount+=testAcoustics.main()
diff --git a/test/testAcoustics.py b/test/testAcoustics.py
index 03c443dfc..51932ccb6 100755
--- a/test/testAcoustics.py
+++ b/test/testAcoustics.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
diff --git a/test/testAdvection.py b/test/testAdvection.py
index 632864b0d..1f1fe4ceb 100755
--- a/test/testAdvection.py
+++ b/test/testAdvection.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
diff --git a/test/testBns.py b/test/testBns.py
index ff684d498..73a39db5c 100755
--- a/test/testBns.py
+++ b/test/testBns.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
diff --git a/test/testCns.py b/test/testCns.py
index 38f8bf173..226ce64bc 100755
--- a/test/testCns.py
+++ b/test/testCns.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
diff --git a/test/testElliptic.py b/test/testElliptic.py
index 0c52a2d0a..79f7e73cc 100755
--- a/test/testElliptic.py
+++ b/test/testElliptic.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -266,7 +266,7 @@ def main():
                     cmd=ellipticBin,
                     settings=ellipticSettings(element=6,data_file=ellipticData3D,dim=3,
                                               precon="NONE", discretization="IPDG"),
-                    referenceNorm=0.353553400508458)
+                    referenceNorm=0.353553400119087)
 
   failCount += test(name="testEllipticHex_Ipdg",
                     cmd=ellipticBin,
@@ -394,7 +394,7 @@ def main():
                     settings=ellipticSettings(element=6,data_file=ellipticData3D,dim=3,
                                               boundary_flag=-1, Lambda=0.0,
                                               discretization="IPDG"),
-                    referenceNorm=0.0595408371412352)
+                    referenceNorm=0.0595408272243646)
 
   failCount += test(name="testEllipticHex_Ipdg_AllNeumann",
                     cmd=ellipticBin,
diff --git a/test/testFokkerPlanck.py b/test/testFokkerPlanck.py
index 7dd989019..43c42e069 100755
--- a/test/testFokkerPlanck.py
+++ b/test/testFokkerPlanck.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
diff --git a/test/testGradient.py b/test/testGradient.py
index ff00b56bd..c170a878c 100755
--- a/test/testGradient.py
+++ b/test/testGradient.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -34,6 +34,7 @@
 def gradientSettings(rcformat="2.0", data_file=gradientData2D,
                      mesh="BOX", dim=2, element=4, nx=10, ny=10, nz=10, boundary_flag=1,
                      degree=4, thread_model=device, platform_number=0, device_number=0,
+                     paradogs_partitioning="NONE",
                      output_to_file="FALSE"):
   return [setting_t("FORMAT", rcformat),
           setting_t("DATA FILE", data_file),
@@ -48,6 +49,7 @@ def gradientSettings(rcformat="2.0", data_file=gradientData2D,
           setting_t("THREAD MODEL", thread_model),
           setting_t("PLATFORM NUMBER", platform_number),
           setting_t("DEVICE NUMBER", device_number),
+          setting_t("PARADOGS PARTITIONING", paradogs_partitioning),
           setting_t("OUTPUT TO FILE", output_to_file)]
 
 def main():
@@ -83,4 +85,4 @@ def main():
 if __name__ == "__main__":
   failCount=0;
   failCount+=main()
-  sys.exit(failCount)
\ No newline at end of file
+  sys.exit(failCount)
diff --git a/test/testInitialGuess.py b/test/testInitialGuess.py
index 0457af9e6..11836c053 100755
--- a/test/testInitialGuess.py
+++ b/test/testInitialGuess.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
diff --git a/test/testIns.py b/test/testIns.py
index fad570f28..72659b662 100755
--- a/test/testIns.py
+++ b/test/testIns.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
diff --git a/test/testLbs.py b/test/testLbs.py
old mode 100644
new mode 100755
index bb0bb6a80..842f5ffff
--- a/test/testLbs.py
+++ b/test/testLbs.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
diff --git a/test/testLinearSolver.py b/test/testLinearSolver.py
index 522cd33c7..f899eedc6 100755
--- a/test/testLinearSolver.py
+++ b/test/testLinearSolver.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
diff --git a/test/testMesh.py b/test/testMesh.py
index b9de9054e..b356db09d 100755
--- a/test/testMesh.py
+++ b/test/testMesh.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -118,4 +118,4 @@ def main():
 if __name__ == "__main__":
   failCount=0;
   failCount+=main()
-  sys.exit(failCount)
\ No newline at end of file
+  sys.exit(failCount)
diff --git a/test/testParAdogs.py b/test/testParAdogs.py
new file mode 100755
index 000000000..269582268
--- /dev/null
+++ b/test/testParAdogs.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+#####################################################################################
+#
+#The MIT License (MIT)
+#
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#
+#Permission is hereby granted, free of charge, to any person obtaining a copy
+#of this software and associated documentation files (the "Software"), to deal
+#in the Software without restriction, including without limitation the rights
+#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#copies of the Software, and to permit persons to whom the Software is
+#furnished to do so, subject to the following conditions:
+#
+#The above copyright notice and this permission notice shall be included in all
+#copies or substantial portions of the Software.
+#
+#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+#SOFTWARE.
+#
+#####################################################################################
+
+from test import *
+from testGradient import *
+
+def main():  #run the ParAdogs partitioning tests and return the accumulated failure count
+  failCount=0;  #every case below runs gradientBin with ranks=2
+  #INERTIAL partitioning, mesh squareTri.msh (element=3, dim=2)
+  failCount += test(name="testParAdogsTri_Inertial_MPI", ranks=2,
+                    cmd=gradientBin,
+                    settings=gradientSettings(element=3,data_file=gradientData2D,dim=2,
+                                              mesh=testDir+"/squareTri.msh",
+                                              paradogs_partitioning="INERTIAL"),
+                    referenceNorm=0.580787485719841)
+  #INERTIAL partitioning, mesh squareQuad.msh (element=4, dim=2)
+  failCount += test(name="testParAdogsQuad_Inertial_MPI", ranks=2,
+                    cmd=gradientBin,
+                    settings=gradientSettings(element=4,data_file=gradientData2D,dim=2,
+                                              mesh=testDir+"/squareQuad.msh",
+                                              paradogs_partitioning="INERTIAL"),
+                    referenceNorm=0.580787485654967)
+  #INERTIAL partitioning, mesh cubeTet.msh (element=6, dim=3)
+  failCount += test(name="testParAdogsTet_Inertial_MPI", ranks=2,
+                    cmd=gradientBin,
+                    settings=gradientSettings(element=6,data_file=gradientData3D,dim=3,
+                                              mesh=testDir+"/cubeTet.msh",
+                                              paradogs_partitioning="INERTIAL"),
+                    referenceNorm=0.942816947760423)
+  #INERTIAL partitioning, mesh cubeHex.msh (element=12, dim=3)
+  failCount += test(name="testParAdogsHex_Inertial_MPI", ranks=2,
+                    cmd=gradientBin,
+                    settings=gradientSettings(element=12,data_file=gradientData3D,dim=3,
+                                              mesh=testDir+"/cubeHex.msh",
+                                              paradogs_partitioning="INERTIAL"),
+                    referenceNorm=0.942816869518335)
+  #SPECTRAL partitioning, mesh squareTri.msh; same referenceNorm as the INERTIAL Tri case
+  failCount += test(name="testParAdogsTri_Spectral_MPI", ranks=2,
+                    cmd=gradientBin,
+                    settings=gradientSettings(element=3,data_file=gradientData2D,dim=2,
+                                              mesh=testDir+"/squareTri.msh",
+                                              paradogs_partitioning="SPECTRAL"),
+                    referenceNorm=0.580787485719841)
+  #SPECTRAL partitioning, mesh squareQuad.msh; same referenceNorm as the INERTIAL Quad case
+  failCount += test(name="testParAdogsQuad_Spectral_MPI", ranks=2,
+                    cmd=gradientBin,
+                    settings=gradientSettings(element=4,data_file=gradientData2D,dim=2,
+                                              mesh=testDir+"/squareQuad.msh",
+                                              paradogs_partitioning="SPECTRAL"),
+                    referenceNorm=0.580787485654967)
+  #SPECTRAL partitioning, mesh cubeTet.msh; same referenceNorm as the INERTIAL Tet case
+  failCount += test(name="testParAdogsTet_Spectral_MPI", ranks=2,
+                    cmd=gradientBin,
+                    settings=gradientSettings(element=6,data_file=gradientData3D,dim=3,
+                                              mesh=testDir+"/cubeTet.msh",
+                                              paradogs_partitioning="SPECTRAL"),
+                    referenceNorm=0.942816947760423)
+  #SPECTRAL partitioning, mesh cubeHex.msh; same referenceNorm as the INERTIAL Hex case
+  failCount += test(name="testParAdogsHex_Spectral_MPI", ranks=2,
+                    cmd=gradientBin,
+                    settings=gradientSettings(element=12,data_file=gradientData3D,dim=3,
+                                              mesh=testDir+"/cubeHex.msh",
+                                              paradogs_partitioning="SPECTRAL"),
+                    referenceNorm=0.942816869518335)
+  #sum of the values returned by test(); presumably one per failing case - TODO confirm in test.py
+  return failCount
+
+if __name__ == "__main__":  #only when this file is executed as a script
+  failCount=0;
+  failCount+=main()
+  sys.exit(failCount)  #process exit status is the failure count returned by main()
diff --git a/test/testParAlmond.py b/test/testParAlmond.py
index d251f65c8..3c41249a6 100755
--- a/test/testParAlmond.py
+++ b/test/testParAlmond.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -105,4 +105,4 @@ def main():
 if __name__ == "__main__":
   failCount=0;
   failCount+=main()
-  sys.exit(failCount)
\ No newline at end of file
+  sys.exit(failCount)
diff --git a/test/testTimeStepper.py b/test/testTimeStepper.py
index b7af95feb..edf20ab4b 100755
--- a/test/testTimeStepper.py
+++ b/test/testTimeStepper.py
@@ -4,7 +4,7 @@
 #
 #The MIT License (MIT)
 #
-#Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+#Copyright (c) 2017-2022 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 #
 #Permission is hereby granted, free of charge, to any person obtaining a copy
 #of this software and associated documentation files (the "Software"), to deal
@@ -117,4 +117,4 @@ def main():
 if __name__ == "__main__":
   failCount=0;
   failCount+=main()
-  sys.exit(failCount)
\ No newline at end of file
+  sys.exit(failCount)